Ejemplo n.º 1
0
        /// <summary>
        /// Prints, page by page, the text found inside a fixed region of a hard-coded PDF file,
        /// waiting for Enter after each page.
        /// </summary>
        /// <param name="args">Command-line arguments (unused).</param>
        static void Main(string[] args)
        {
            // Dispose the reader once all pages were printed so the file is released.
            using (PdfReader reader = new PdfReader(@"D:\191.pdf"))
            {
                // Lazily yields, for every page, the text inside the rectangle
                // (llx, lly) - (urx, ury). Must be enumerated while the reader is open.
                IEnumerable<string> GetColumnText(float llx, float lly, float urx, float ury)
                {
                    // The region and its filter do not depend on the page, so build them once.
                    var region  = new iTextSharp.text.Rectangle(llx, lly, urx, ury);
                    var filters = new RenderFilter[] { new RegionTextRenderFilter(region) };
                    int pageTotal = reader.NumberOfPages;

                    for (int pageIndex = 1; pageIndex <= pageTotal; pageIndex++)
                    {
                        // A new strategy per page, exactly as the original code did.
                        var strategy = new FilteredTextRenderListener(new LocationTextExtractionStrategy(), filters);
                        yield return PdfTextExtractor.GetTextFromPage(reader, pageIndex, strategy);
                    }
                }

                foreach (string result in GetColumnText(0, 0, 500, 500000))
                {
                    Console.Write("{0} ", result);
                    Console.ReadLine();
                }
            }
        }
Ejemplo n.º 2
0
        /// <summary>
        /// Extracts the text of one page, keeping only the content accepted by the given
        /// render filters, through a <c>LimitedTextStrategy2</c> wrapper.
        /// </summary>
        /// <param name="pr">Reader of the PDF document.</param>
        /// <param name="pageI">Page number handed to the extractor.</param>
        /// <param name="f">Render filters applied before text is collected.</param>
        /// <returns>The text returned by the extractor for that page.</returns>
        public static string ExtractText(this PdfReader pr, int pageI, RenderFilter[] f)
        {
            var locationStrategy = new LocationTextExtractionStrategy();
            ITextExtractionStrategy filtered = new FilteredTextRenderListener(locationStrategy, f);
            var limited = new LimitedTextStrategy2(filtered);

            return PdfTextExtractor.GetTextFromPage(pr, pageI, limited);
        }
Ejemplo n.º 3
0
        /// <summary>
        /// Extracts the text inside a rectangle of one page, splits it into lines,
        /// drops blank lines, trims the rest, and echoes every kept line to the console.
        /// </summary>
        /// <param name="reader">Reader of the PDF document.</param>
        /// <param name="pageNumber">Page number handed to the extractor.</param>
        /// <param name="cordinate1">First rectangle constructor argument.</param>
        /// <param name="coordinate2">Second rectangle constructor argument.</param>
        /// <param name="coordinate3">Third rectangle constructor argument.</param>
        /// <param name="coordinate4">Fourth rectangle constructor argument.</param>
        /// <returns>The non-blank, trimmed lines of the region in extraction order.</returns>
        public List <string> getTextByCoOrdinate(PdfReader reader, int pageNumber, int cordinate1, int coordinate2, int coordinate3, int coordinate4)
        {
            var region  = new iTextSharp.text.Rectangle(cordinate1, coordinate2, coordinate3, coordinate4);
            var filters = new RenderFilter[] { new RegionTextRenderFilter(region) };
            ITextExtractionStrategy strategy = new FilteredTextRenderListener(new LocationTextExtractionStrategy(), filters);

            string pageText = PdfTextExtractor.GetTextFromPage(reader, pageNumber, strategy);

            var lines = new List <string>();

            foreach (string rawLine in pageText.Split('\n'))
            {
                if (string.IsNullOrWhiteSpace(rawLine))
                {
                    continue;
                }

                string trimmed = rawLine.Trim();
                lines.Add(trimmed);
                Console.WriteLine(trimmed);
            }

            return lines;
        }
Ejemplo n.º 4
0
        // Extracts the text inside a rectangular area of one page of the PDF at `filepath`.
        // When `landscape` is true the area is rotated first (x becomes 8.23 - y - height,
        // y becomes x, width and height are swapped). All four values are then multiplied
        // by 72 (called dpi here; presumably inches to PDF points - confirm with callers).
        // The returned string is the extracted text followed by a line terminator.
        string GetTextByLocation(int page, RectangleJ area, bool landscape)
        {
            const float dpi             = 72.0f;
            float       landscapeHeight = 8.23f;

            RectangleJ scaled = landscape
                ? new RectangleJ(landscapeHeight - area.Y - area.Height, area.X, area.Height, area.Width)
                : new RectangleJ(area.X, area.Y, area.Width, area.Height);

            scaled.X      *= dpi;
            scaled.Y      *= dpi;
            scaled.Width  *= dpi;
            scaled.Height *= dpi;

            var regionFilters = new RenderFilter[] { new RegionTextRenderFilter(scaled) };
            var buffer        = new StringBuilder();

            using (PdfReader reader = new PdfReader(filepath))
            {
                ITextExtractionStrategy strategy = new FilteredTextRenderListener(new LocationTextExtractionStrategy(), regionFilters);
                buffer.AppendLine(PdfTextExtractor.GetTextFromPage(reader, page, strategy));
            }

            return buffer.ToString();
        }
        /// <summary>
        /// Scans every page of the PDF at <c>outputFile</c> for highlight annotations,
        /// stores each highlight's bounding rectangle in <c>listaRectangulos</c> and writes a
        /// description of the most recently found highlight to <c>outputData</c>.
        /// </summary>
        /// <remarks>
        /// The list is cleared before scanning. Failures while reading the file are reported
        /// through <c>System.Diagnostics.Trace</c> instead of being silently discarded; the
        /// method itself still does not throw for them. Rectangles found before the failure
        /// stay in the list.
        /// </remarks>
        public void GetRectAnno()
        {
            string filePath = outputFile;

            listaRectangulos.Clear();
            try
            {
                using (PdfReader reader = new PdfReader(filePath))
                {
                    for (int i = 1; i <= reader.NumberOfPages; i++)
                    {
                        PdfDictionary page   = reader.GetPageN(i);
                        PdfArray      annots = page.GetAsArray(iTextSharp.text.pdf.PdfName.ANNOTS);
                        if (annots == null)
                        {
                            continue;
                        }

                        foreach (PdfObject annot in annots.ArrayList)
                        {
                            // Skip entries that do not resolve to a dictionary instead of
                            // aborting the whole scan with a cast exception.
                            PdfDictionary annotationDic = PdfReader.GetPdfObject(annot) as PdfDictionary;
                            if (annotationDic == null)
                            {
                                continue;
                            }

                            // Only highlight annotations are of interest; a missing subtype is skipped.
                            PdfName subType = annotationDic.Get(PdfName.SUBTYPE) as PdfName;
                            if (subType == null || !subType.Equals(PdfName.HIGHLIGHT))
                            {
                                continue;
                            }

                            PdfArray coordinates = annotationDic.GetAsArray(PdfName.RECT);
                            if (coordinates == null || coordinates.ArrayList.Count < 4)
                            {
                                continue;
                            }

                            // The placeholders are now actually filled in; the original
                            // concatenated the values after the unformatted template.
                            outputData = string.Format(
                                "HighLight at Rectangle {0} with QuadPoints {1}\n",
                                coordinates,
                                annotationDic.GetAsArray(PdfName.QUADPOINTS));

                            iTextSharp.text.Rectangle rect = new iTextSharp.text.Rectangle(
                                ParseAnnotationCoordinate(coordinates, 0),
                                ParseAnnotationCoordinate(coordinates, 1),
                                ParseAnnotationCoordinate(coordinates, 2),
                                ParseAnnotationCoordinate(coordinates, 3));

                            listaRectangulos.Add(rect);

                            // The original also extracted the highlighted text with a
                            // RegionTextRenderFilter here but never used the result, so that
                            // per-annotation page parse was removed.
                        }
                    }
                }
            }
            catch (Exception ex)
            {
                // Best effort is kept (no rethrow), but the failure is no longer invisible.
                System.Diagnostics.Trace.TraceError("GetRectAnno failed for '{0}': {1}", filePath, ex);
            }
        }

        // Parses element `index` of a PDF number array as a float using the invariant culture.
        private static float ParseAnnotationCoordinate(PdfArray values, int index)
        {
            return float.Parse(values.ArrayList[index].ToString(), CultureInfo.InvariantCulture.NumberFormat);
        }
Ejemplo n.º 6
0
        /// <summary>
        /// Reads the text in the top-right corner of a page (the header area) so it can be
        /// used as the page's entry in a table of contents. The raw text is also echoed to
        /// the console.
        /// </summary>
        /// <returns>The header text with all tab, line-feed and carriage-return characters removed.</returns>
        /// <param name="pdfPath">Path of the PDF to read from.</param>
        /// <param name="page">Number of the page to read from.</param>
        public static string GetContentsTextFromPage(String pdfPath, int page)
        {
            // Both the reader and the writer are disposed even when extraction throws.
            using (PdfReader reader = new PdfReader(pdfPath))
            using (StringWriter output = new StringWriter())
            {
                // Rectangle covering the header: measured from the right and top page edges.
                Rectangle mediabox = reader.GetPageSize(page);
                float     llx      = mediabox.GetRight(10f) - 100f;
                float     urx      = mediabox.GetRight(0f);
                float     lly      = mediabox.GetTop(10f) - 50f;
                float     ury      = mediabox.GetTop(0f);
                Rectangle rect     = new Rectangle(llx, lly, urx, ury);

                // The header contains the name of the page; restrict extraction to it.
                RenderFilter            regionFilter = new RegionTextRenderFilter(rect);
                ITextExtractionStrategy strategy     = new FilteredTextRenderListener(
                    new LocationTextExtractionStrategy(), regionFilter);

                output.WriteLine(PdfTextExtractor.GetTextFromPage(reader, page, strategy));
                string ret = output.ToString();
                Console.WriteLine(ret);

                // Remove tab and newline characters.
                return Regex.Replace(ret, @"\t|\n|\r", "");
            }
        }
Ejemplo n.º 7
0
        /// <summary>
        /// Click handler: extracts the text inside a fixed region of page 1 of a hard-coded
        /// PDF file and shows it in <c>lblText</c>.
        /// </summary>
        /// <param name="sender">Event source (unused).</param>
        /// <param name="e">Event data (unused).</param>
        protected void Button1_Click(object sender, EventArgs e)
        {
            // Dispose the reader so the file is not kept open after the click.
            using (PdfReader pdfReader = new PdfReader("C:\\TESTE\\export.pdf"))
            {
                // Rectangle takes (llx, lly, urx, ury): lower-left corner first, then
                // upper-right. The original call passed the corners swapped
                // (800, 700, 0, 500); this is the same area written in the documented order.
                Rectangle               rect         = new Rectangle(0, 500, 800, 700);
                RenderFilter            regionFilter = new RegionTextRenderFilter(rect);
                ITextExtractionStrategy strategy     = new FilteredTextRenderListener(
                    new LocationTextExtractionStrategy(), regionFilter);

                lblText.Text = PdfTextExtractor.GetTextFromPage(pdfReader, 1, strategy);
            }
        }
Ejemplo n.º 8
0
        /// <summary>
        /// Checks whether the text extracted from a rectangle on page 1 equals the expected text.
        /// </summary>
        /// <param name="reader">Reader of the PDF document.</param>
        /// <param name="text">Text the region is expected to contain, compared as a whole.</param>
        /// <param name="rect">Region of page 1 to extract from.</param>
        /// <returns><c>true</c> when the extracted text equals <paramref name="text"/>.</returns>
        private bool TextIsInRectangle(PdfReader reader, String text, Rectangle rect)
        {
            var regionFilter = new RegionTextRenderFilter(rect);
            var listener     = new FilteredTextRenderListener(new LocationTextExtractionStrategy(), regionFilter);

            String found = PdfTextExtractor.GetTextFromPage(reader, 1, listener);

            bool matches = found.Equals(text);
            return matches;
        }
Ejemplo n.º 9
0
        /// <summary>
        /// Extracts the text inside the rectangle (llx, lly) - (urx, ury) of one page.
        /// </summary>
        /// <param name="reader">Reader of the PDF document.</param>
        /// <param name="pageNumber">Page number handed to the extractor.</param>
        /// <param name="llx">Lower-left x of the region.</param>
        /// <param name="lly">Lower-left y of the region.</param>
        /// <param name="urx">Upper-right x of the region.</param>
        /// <param name="ury">Upper-right y of the region.</param>
        /// <returns>The text returned by the extractor for that region.</returns>
        public static string GetRectangle(PdfReader reader, int pageNumber, float llx, float lly, float urx, float ury)
        {
            var region = new iTextSharp.text.Rectangle(llx, lly, urx, ury);
            RenderFilter[] filters = { new RegionTextRenderFilter(region) };
            ITextExtractionStrategy strategy = new FilteredTextRenderListener(new LocationTextExtractionStrategy(), filters);

            string regionText = PdfTextExtractor.GetTextFromPage(reader, pageNumber, strategy);
            return regionText;
        }
Ejemplo n.º 10
0
        /// <summary>
        /// Extracts the text of one column region from a page and splits it into rows
        /// with <c>SplitColumnTextIntoRows</c>.
        /// </summary>
        /// <param name="column">Region of the page that holds the column.</param>
        /// <param name="reader">Reader of the PDF document.</param>
        /// <param name="page">Page number handed to the extractor.</param>
        /// <returns>The rows produced by <c>SplitColumnTextIntoRows</c>.</returns>
        private string[] ExtractCurrentColumnFromPage(Rectangle column, PdfReader reader, int page)
        {
            RenderFilter[] columnFilters = { new RegionTextRenderFilter(column) };
            var listener = new FilteredTextRenderListener(new LocationTextExtractionStrategy(), columnFilters);

            string columnText = PdfTextExtractor.GetTextFromPage(reader, page, listener);
            return SplitColumnTextIntoRows(columnText);
        }
Ejemplo n.º 11
0
        /// <summary>
        /// Opens a PDF file and extracts the text inside a rectangle on its first page.
        /// </summary>
        /// <param name="file">Path of the PDF file.</param>
        /// <param name="rectangle">Region of page 1 to extract from.</param>
        /// <returns>The text returned by the extractor for that region.</returns>
        public string GetStringValueFromRegion(string file, iTextSharp.text.Rectangle rectangle)
        {
            // The reader is opened here, so it is also released here.
            using (var reader = new PdfReader(file))
            {
                var renderFilter = new RenderFilter[] { new RegionTextRenderFilter(rectangle) };
                ITextExtractionStrategy textExtractionStrategy = new FilteredTextRenderListener(new LocationTextExtractionStrategy(), renderFilter);

                return PdfTextExtractor.GetTextFromPage(reader, 1, textExtractionStrategy);
            }
        }
Ejemplo n.º 12
0
        /// <summary>
        /// Extracts the text inside a rectangle of one page and replaces every line feed
        /// with a space.
        /// </summary>
        /// <param name="reader">Reader of the PDF document.</param>
        /// <param name="rect">Region of the page to extract from.</param>
        /// <param name="pageNumber">Page number handed to the extractor.</param>
        /// <returns>The extracted text with line feeds replaced by spaces.</returns>
        private static string GetStringFromFile(PdfReader reader, Rectangle rect, int pageNumber)
        {
            var regionFilters = new RenderFilter[] { new RegionTextRenderFilter(rect) };
            ITextExtractionStrategy regionStrategy = new FilteredTextRenderListener(new LocationTextExtractionStrategy(), regionFilters);

            string raw = PdfTextExtractor.GetTextFromPage(reader, pageNumber, regionStrategy);

            // '\n' is character code 10.
            return raw.Replace('\n', ' ');
        }
Ejemplo n.º 13
0
        /// <summary>
        /// Builds a text-extraction strategy restricted to a rectangular region.
        /// </summary>
        /// <param name="pixelDistanceFromLeft">First <c>RectangleJ</c> constructor argument.</param>
        /// <param name="pixelDistanceFromBottom">Second <c>RectangleJ</c> constructor argument.</param>
        /// <param name="pixelDistanceWidth">Third <c>RectangleJ</c> constructor argument.</param>
        /// <param name="pixelDistanceHeight">Fourth <c>RectangleJ</c> constructor argument.</param>
        /// <returns>A location-based strategy filtered to the region.</returns>
        public static ITextExtractionStrategy MakeRectangle(float pixelDistanceFromLeft, float pixelDistanceFromBottom, float pixelDistanceWidth, float pixelDistanceHeight)
        {
            var region = new System.util.RectangleJ(pixelDistanceFromLeft, pixelDistanceFromBottom, pixelDistanceWidth, pixelDistanceHeight);
            RenderFilter[] regionFilters = { new RegionTextRenderFilter(region) };

            return new FilteredTextRenderListener(new LocationTextExtractionStrategy(), regionFilters);
        }
Ejemplo n.º 14
0
        /// <summary>
        /// Extracts the text inside the rectangle (llx, lly) - (urx, ury) of one page.
        /// </summary>
        /// <remarks>Coordinates are in points: 1 in = 2.54 cm = 72 points.</remarks>
        /// <param name="reader">Reader of the PDF document.</param>
        /// <param name="pageNum">Page number handed to the extractor.</param>
        /// <param name="llx">Lower-left x of the region.</param>
        /// <param name="lly">Lower-left y of the region.</param>
        /// <param name="urx">Upper-right x of the region.</param>
        /// <param name="ury">Upper-right y of the region.</param>
        /// <returns>The text returned by the extractor for that region.</returns>
        private static string GetColumnText(PdfReader reader, int pageNum, float llx, float lly, float urx, float ury)
        {
            var column  = new iTextSharp.text.Rectangle(llx, lly, urx, ury);
            var filters = new RenderFilter[] { new RegionTextRenderFilter(column) };
            var listener = new FilteredTextRenderListener(new LocationTextExtractionStrategy(), filters);

            return PdfTextExtractor.GetTextFromPage(reader, pageNum, listener);
        }
Ejemplo n.º 15
0
        /// <summary>
        /// Extracts the text inside a rectangle on page 1 of <c>reader</c> and converts it
        /// to lines with <c>stockLine</c>.
        /// </summary>
        /// <param name="x">First <c>RectangleJ</c> constructor argument.</param>
        /// <param name="y">Second <c>RectangleJ</c> constructor argument.</param>
        /// <param name="w">Third <c>RectangleJ</c> constructor argument.</param>
        /// <param name="h">Fourth <c>RectangleJ</c> constructor argument.</param>
        /// <returns>The result of <c>stockLine</c> for the extracted text plus a line terminator.</returns>
        public List <Line> getTextFromRectangle(int x, int y, int w, int h)
        {
            var region        = new System.util.RectangleJ(x, y, w, h);
            var regionFilters = new RenderFilter[] { new RegionTextRenderFilter(region) };
            ITextExtractionStrategy regionStrategy = new FilteredTextRenderListener(new LocationTextExtractionStrategy(), regionFilters);

            var buffer = new StringBuilder();
            buffer.AppendLine(PdfTextExtractor.GetTextFromPage(reader, 1, regionStrategy));

            return stockLine(buffer.ToString());
        }
Ejemplo n.º 16
0
        /// <summary>
        /// Extracts the text of two regions (A and B) from page 1 of <c>reader</c> and stores
        /// the results in <c>textA</c> and <c>textB</c>. The regions come from the
        /// <c>coordinate1a..4a</c> and <c>coordinate1b..4b</c> members.
        /// </summary>
        private void ReadText()
        {
            var regionA = new iTextSharp.text.Rectangle(coordinate1a, coordinate2a, coordinate3a, coordinate4a);
            var regionB = new iTextSharp.text.Rectangle(coordinate1b, coordinate2b, coordinate3b, coordinate4b);

            RenderFilter filterA = new RegionTextRenderFilter(regionA);
            RenderFilter filterB = new RegionTextRenderFilter(regionB);

            // Each region gets its own strategy instance.
            ITextExtractionStrategy strategyA = new FilteredTextRenderListener(new LocationTextExtractionStrategy(), filterA);
            ITextExtractionStrategy strategyB = new FilteredTextRenderListener(new LocationTextExtractionStrategy(), filterB);

            textA = PdfTextExtractor.GetTextFromPage(reader, 1, strategyA);
            textB = PdfTextExtractor.GetTextFromPage(reader, 1, strategyB);
        }
Ejemplo n.º 17
0
        /// <summary>
        /// Reads an ID region and a page-counter region from every page of the source PDF
        /// and writes one tab-separated row per page to <c>extractedIDs.txt</c>.
        /// </summary>
        /// <remarks>
        /// A running counter restarts whenever the ID differs from the previous page's ID.
        /// A page whose extraction throws is written as "<c>index\tfailed</c>" and processing
        /// continues. The reader, stream and writer are released even when an exception
        /// escapes the loop.
        /// </remarks>
        public static void ExtractIDs()
        {
            using (PdfReader reader = new PdfReader($@"{jobDir}\PDF Extraction\temp\{sourcePDF}"))
            using (FileStream fs = new FileStream($@"{jobDir}\PDF Extraction\temp\extractedIDs.txt", FileMode.Create))
            using (StreamWriter sw = new StreamWriter(fs))
            {
                // Both regions are fixed, so their filters are built once.
                RenderFilter idFilter    = new RegionTextRenderFilter(new Rectangle(414, 660, 522, 689));
                RenderFilter pagesFilter = new RegionTextRenderFilter(new Rectangle(465, 565, 555, 635));

                string previousVal = "";
                int    count       = 0;

                sw.WriteLine("Index\tID\tPageCounter\tPageNumber\tFileName");

                for (int i = 1; i <= reader.NumberOfPages; i++)
                {
                    try
                    {
                        // Each extraction uses its own strategy instance.
                        ITextExtractionStrategy strategy = new FilteredTextRenderListener(new LocationTextExtractionStrategy(), idFilter);
                        string currentVal = PdfTextExtractor.GetTextFromPage(reader, i, strategy);

                        strategy = new FilteredTextRenderListener(new LocationTextExtractionStrategy(), pagesFilter);
                        string pages = PdfTextExtractor.GetTextFromPage(reader, i, strategy);

                        // Restart the per-ID counter when a new ID begins.
                        if (previousVal != currentVal)
                        {
                            count = 0;
                        }
                        count++;
                        previousVal = currentVal;
                        sw.WriteLine($"{i}\t{currentVal}\t{pages.Split('\n')[0]}\t{count}\t{currentVal}-{count}");
                    }
                    catch (Exception)
                    {
                        sw.WriteLine($"{i}\tfailed");
                    }
                }
            }
        }
Ejemplo n.º 18
0
        /// <summary>
        /// Parses a PDF file in two parts per page. Every part is written to an existing
        /// text file (here: on the desktop) through <c>WriteInfile</c>. The base-font name of
        /// every font on each page is also printed to the console.
        /// </summary>
        /// <param name="args">Command-line arguments (unused).</param>
        static void Main(string[] args)
        {
            const string filePath = @"c:\Users\sofiane\Desktop\25_1.pdf";
            const string outPath  = @"c:\Users\sofiane\Desktop\test.txt";

            // Per the original author's note: without this registration, reading more than
            // one page produces an encoding error on the second page.
            System.Text.Encoding.RegisterProvider(System.Text.CodePagesEncodingProvider.Instance);
            PdfReader reader = new PdfReader(filePath);

            HashSet <String> names = new HashSet <string>();

            try
            {
                for (int page = 1; page <= reader.NumberOfPages; page++)
                {
                    PdfDictionary dic       = reader.GetPageN(page);
                    PdfDictionary resources = dic.GetAsDict(PdfName.RESOURCES);
                    PdfDictionary fonts     = resources == null ? null : resources.GetAsDict(PdfName.FONT);
                    if (fonts != null)
                    {
                        foreach (PdfName key in fonts.Keys)
                        {
                            // Skip font entries without a dictionary or a BaseFont name
                            // instead of failing with a null reference.
                            PdfDictionary font     = fonts.GetAsDict(key);
                            PdfName       baseFont = font == null ? null : font.GetAsName(PdfName.BASEFONT);
                            if (baseFont == null)
                            {
                                continue;
                            }

                            string name = baseFont.ToString();
                            Console.WriteLine(name);
                            names.Add(name);
                        }
                    }

                    // First part of the page.
                    RenderFilter[]          filter = get_render(page, 0);
                    ITextExtractionStrategy its    = new FilteredTextRenderListener(new LocationTextExtractionStrategy(), filter);
                    WriteInfile(reader, page, its, outPath);

                    // Second part of the page (the original declared this with the
                    // misspelled type "leaRenderFilter[]", which does not compile).
                    RenderFilter[]          filter2 = get_render(page, 1);
                    ITextExtractionStrategy itsL    = new FilteredTextRenderListener(new LocationTextExtractionStrategy(), filter2);
                    WriteInfile(reader, page, itsL, outPath);
                }
            }
            finally
            {
                // Close the reader even when a page fails.
                reader.Close();
            }
        }
        /// <summary>
        /// Extracts text from a PDF file (only selectable text content from the pages, not OCR from images).
        /// </summary>
        /// <param name="filepath">Input file path.</param>
        /// <param name="zone">Rectangle, in millimetres with y measured down from the top of the page, that limits where text is extracted. If null, the full page is processed.</param>
        /// <param name="pages">Page numbers to extract. If null or the first item is 0, all pages are extracted. An empty list yields an empty result. Numbers outside 1..page count are skipped.</param>
        /// <returns>A list of strings, one per extracted page, with line feeds replaced by <c>Environment.NewLine</c>.</returns>
        public static List <string> GetPdfTextFromPages(string filepath, RectangleF?zone = null, List <int> pages = null)
        {
            using (PdfReader reader = new PdfReader(filepath))
            {
                List <string> result = new List <string>();

                // Decide which pages to read without touching the caller's list.
                // Checking Count first avoids the exception First() threw on an empty list.
                List <int> pageNumbers = pages;
                if (pageNumbers == null || (pageNumbers.Count > 0 && pageNumbers[0] == 0))
                {
                    pageNumbers = Enumerable.Range(1, reader.NumberOfPages).ToList();
                }

                foreach (var i in pageNumbers)
                {
                    if (i < 1 || i > reader.NumberOfPages)
                    {
                        continue;
                    }

                    string text;

                    if (zone.HasValue)
                    { //zone based text extract
                        float x = Utilities.MillimetersToPoints(zone.Value.X);
                        float y = Utilities.MillimetersToPoints(zone.Value.Y);
                        float w = Utilities.MillimetersToPoints(zone.Value.Width);
                        float h = Utilities.MillimetersToPoints(zone.Value.Height);

                        // Translate the top-down zone into page coordinates by subtracting
                        // from the top edge of the (rotated) page.
                        var pagesize = reader.GetPageSizeWithRotation(i);
                        iTextSharp.text.Rectangle rect = new iTextSharp.text.Rectangle(x, pagesize.Top - y, x + w, pagesize.Top - y - h);

                        RenderFilter[] renderFilter = { new RegionTextRenderFilter(rect) };
                        ITextExtractionStrategy textExtractionStrategy = new FilteredTextRenderListener(new LocationTextExtractionStrategy(), renderFilter);
                        text = PdfTextExtractor.GetTextFromPage(reader, i, textExtractionStrategy);
                    }
                    else
                    { //full page text extract
                        text = PdfTextExtractor.GetTextFromPage(reader, i);
                    }

                    result.Add(text.Replace("\n", Environment.NewLine));
                }

                // No explicit Close(): the using statement releases the reader.
                return result;
            }
        }
Ejemplo n.º 20
0
    /// <summary>
    /// Reads a downloaded PDF and extracts the text of a 6 x 8 grid of cells from page 1,
    /// appending one string per processed cell to <c>nakedData</c>. The raw bytes are also
    /// stored in <c>bytes</c>.
    /// </summary>
    /// <remarks>
    /// When <c>IsLaba</c> accepts a cell's text, the cell is re-read with a doubled
    /// horizontal step for its right edge and the following column is skipped.
    /// </remarks>
    /// <param name="www">Completed web request whose download handler holds the PDF data.</param>
    void TakeDataFromPdf(UnityWebRequest www)
    {
        // The stream and the reader are released even when extraction throws.
        using (MemoryStream mm = new MemoryStream(www.downloadHandler.data))
        {
            bytes = mm.ToArray();

            using (PdfReader reader = new PdfReader(mm))
            {
                for (int q = 0; q < 6; q++)
                {
                    for (int j = 0; j < 8; j++)
                    {
                        Rectangle    rect   = new Rectangle(45 + (j * Adeltax), 40 + (q * Adeltay), 138 + (j * Bdeltax), 130 + (q * Bdeltay));
                        RenderFilter filter = new RegionTextRenderFilter(rect);

                        ITextExtractionStrategy strategy = new FilteredTextRenderListener(new SimpleTextExtractionStrategy(), filter);
                        string s = PdfTextExtractor.GetTextFromPage(reader, 1, strategy);

                        if (IsLaba(s))
                        {
                            // In the first column the right edge gets one extra single step.
                            float microBdeltaX = 0;

                            if (j == 0)
                            {
                                microBdeltaX = Bdeltax;
                            }

                            // Use a local doubled step instead of temporarily doubling the
                            // Bdeltax member, which stayed doubled if extraction threw.
                            var wideDeltaX = Bdeltax * 2;

                            rect     = new Rectangle(45 + (j * Adeltax), 40 + (q * Adeltay), 138 + (j * wideDeltaX) + microBdeltaX, 130 + (q * Bdeltay));
                            filter   = new RegionTextRenderFilter(rect);
                            strategy = new FilteredTextRenderListener(new SimpleTextExtractionStrategy(), filter);
                            s        = PdfTextExtractor.GetTextFromPage(reader, 1, strategy);

                            // Skip the next column: it was covered by the widened cell.
                            j++;
                        }
                        nakedData.Add(s);
                    }
                }
            }
        }
    }
Ejemplo n.º 21
0
        /// <summary>
        /// Prints, for every page of <c>pdfReader</c>, the text found inside a fixed rectangle.
        /// </summary>
        /// <returns>Always <c>null</c>; the text is only written to the console.</returns>
        public string ExtractByCoordinate()
        {
            Rectangle rectangle = new Rectangle(320, 785 - 250, 368, 799 - 250);
            RenderFilter filter = new RegionTextRenderFilter(rectangle);

            for (int page = 1; page <= pdfReader.NumberOfPages; page++)
            {
                // Build a new strategy for every page. The original shared one
                // LocationTextExtractionStrategy across all pages; that strategy keeps the
                // text it has collected, so later pages would also print earlier pages' text.
                ITextExtractionStrategy strategy = new FilteredTextRenderListener(new LocationTextExtractionStrategy(), filter);
                Console.WriteLine(PdfTextExtractor.GetTextFromPage(pdfReader, page, strategy));
            }

            return null;
        }
Ejemplo n.º 22
0
        /// <summary>
        /// Extracts the text inside a region selected in UI coordinates. The two corner
        /// points are scaled by the ratio of <c>PageSize()</c> to the UI size, and their y
        /// values are flipped against the UI height before scaling.
        /// </summary>
        /// <param name="UIWith">Width of the UI surface the points refer to.</param>
        /// <param name="UIHeight">Height of the UI surface the points refer to.</param>
        /// <param name="ll">First corner in UI coordinates.</param>
        /// <param name="ur">Second corner in UI coordinates.</param>
        /// <param name="page">Page number handed to the extractor (default 1).</param>
        /// <returns>The text returned by the extractor for that region.</returns>
        public string ExtractData(float UIWith, float UIHeight, Point ll, Point ur,
                                  int page = 1)
        {
            Console.WriteLine("Test");

            float scaleX = PageSize().Width / UIWith;
            float scaleY = PageSize().Height / UIHeight;

            float firstX  = ll.X * scaleX;
            float firstY  = (UIHeight - ll.Y) * scaleY;
            float secondX = ur.X * scaleX;
            float secondY = (UIHeight - ur.Y) * scaleY;

            var region       = new Rectangle(firstX, firstY, secondX, secondY);
            var regionFilter = new RegionTextRenderFilter(region);
            ITextExtractionStrategy regionStrategy = new FilteredTextRenderListener(new LocationTextExtractionStrategy(), regionFilter);

            return PdfTextExtractor.GetTextFromPage(PdfReader, page, regionStrategy);
        }
Ejemplo n.º 23
0
 /// <summary>
 /// Reads the text inside a fixed ID region on page 1 of a PDF file.
 /// </summary>
 /// <param name="fileName">Path of the PDF file.</param>
 /// <returns>The extracted text, or "unreadable" when opening or reading the file throws.</returns>
 public static string ReadID(string fileName)
 {
     try
     {
         // Dispose the reader so the file is released after the ID was read.
         using (PdfReader reader = new PdfReader(fileName))
         {
             Rectangle    area   = new Rectangle(414, 660, 522, 689);
             RenderFilter filter = new RegionTextRenderFilter(area);
             ITextExtractionStrategy strategy = new FilteredTextRenderListener(new LocationTextExtractionStrategy(), filter);
             return PdfTextExtractor.GetTextFromPage(reader, 1, strategy);
         }
     }
     catch (Exception)
     {
         return "unreadable";
     }
 }
Ejemplo n.º 24
0
        /// <summary>
        /// Extracts the text inside a rectangle of one page of a PDF file and passes it
        /// through a default-encoding to UTF-8 byte conversion before returning it.
        /// </summary>
        /// <param name="filename">Path of the PDF file.</param>
        /// <param name="page">Page number handed to the extractor.</param>
        /// <param name="x">First <c>RectangleJ</c> constructor argument.</param>
        /// <param name="y">Second <c>RectangleJ</c> constructor argument.</param>
        /// <param name="width">Third <c>RectangleJ</c> constructor argument.</param>
        /// <param name="height">Fourth <c>RectangleJ</c> constructor argument.</param>
        /// <returns>The converted text of the region.</returns>
        public string GetTextInArea(string filename, int page, int x, int y, int width, int height)
        {
            using (PdfReader pdfReader = new PdfReader(filename))
            {
                var region = new System.util.RectangleJ(x, y, width, height);
                RenderFilter regionFilter = new RegionTextRenderFilter(region);
                var listener = new FilteredTextRenderListener(new LocationTextExtractionStrategy(), regionFilter);

                string extracted = PdfTextExtractor.GetTextFromPage(pdfReader, page, listener);

                // NOTE(review): this round-trips the text through Encoding.Default; characters
                // that encoding cannot represent may not survive. Confirm this is intended.
                byte[] defaultBytes = Encoding.Default.GetBytes(extracted);
                byte[] utf8Bytes    = Encoding.Convert(Encoding.Default, Encoding.UTF8, defaultBytes);

                return Encoding.UTF8.GetString(utf8Bytes);
            }
        }
Ejemplo n.º 25
0
        /// <summary>
        /// Extracts, for every page of <c>Document</c>, the text inside the fixed
        /// <c>RectangleJ(0, 0, 2000, 1800)</c> region and joins the pages, one per line.
        /// </summary>
        /// <returns>The text of all pages, each followed by a line terminator.</returns>
        public string ReadFromPosistionIText()
        {
            var region        = new RectangleJ(0, 0, 2000, 1800);
            var regionFilters = new RenderFilter[] { new RegionTextRenderFilter(region) };
            var pagesText     = new StringBuilder();

            int pageNumber = 1;
            while (pageNumber <= Document.NumberOfPages)
            {
                // Each page is read with its own strategy instance.
                ITextExtractionStrategy pageStrategy = new FilteredTextRenderListener(new LocationTextExtractionStrategy(), regionFilters);
                string pageText = PdfTextExtractor.GetTextFromPage(Document, pageNumber, pageStrategy);

                pagesText.AppendLine(pageText);
                pageNumber++;
            }

            return pagesText.ToString();
        }
Ejemplo n.º 26
0
 /// <summary>
 /// Handles one tag: when <paramref name="obj"/> is a number it is used as a marked-content
 /// id, the page text filtered by that id is extracted, XML-escaped and written to
 /// <c>outp</c>. Every other object is passed to the base implementation.
 /// </summary>
 /// <param name="tag">Tag name, forwarded to the base implementation.</param>
 /// <param name="obj">Object attached to the tag.</param>
 /// <param name="page">Page dictionary whose content stream is processed.</param>
 public override void ParseTag(String tag, PdfObject obj, PdfDictionary page)
 {
     PdfNumber mcid = obj as PdfNumber;
     if (mcid == null)
     {
         base.ParseTag(tag, obj, page);
         return;
     }

     RenderFilter mcidFilter = new AcroFieldsFlattenTest.MyMarkedContentRenderFilter(mcid.IntValue);
     ITextExtractionStrategy simpleStrategy = new SimpleTextExtractionStrategy();
     FilteredTextRenderListener filteredListener = new FilteredTextRenderListener(simpleStrategy, mcidFilter);
     PdfContentStreamProcessor contentProcessor = new PdfContentStreamProcessor(filteredListener);

     contentProcessor.ProcessContent(PdfReader.GetPageContent(page), page.GetAsDict(PdfName.RESOURCES));

     outp.Write(XMLUtil.EscapeXML(filteredListener.GetResultantText(), true));
 }
Ejemplo n.º 27
0
        /// <summary>
        /// Extracts, from every page of a PDF file, the text inside the fixed rectangle
        /// (0, 0) - (415, 775), passes the collected text to <c>MakeTempFile</c> and returns it.
        /// </summary>
        /// <param name="fileName">Path of the PDF file. When the file does not exist the collected text is empty.</param>
        /// <param name="path">Second argument forwarded to <c>MakeTempFile</c>.</param>
        /// <returns>The text of all pages, each followed by a line terminator.</returns>
        public static string ReadDATAFromPDF(string fileName, string path)
        {
            StringBuilder sb = new StringBuilder();

            if (File.Exists(fileName))
            {
                // Dispose the reader so the source file is released after reading.
                using (PdfReader reader = new PdfReader(fileName))
                {
                    Rectangle      rect   = new Rectangle(0, 0, 415, 775);
                    RenderFilter[] filter = { new RegionTextRenderFilter(rect) };

                    for (int i = 1; i <= reader.NumberOfPages; i++)
                    {
                        // Each page is read with its own strategy instance.
                        ITextExtractionStrategy strategy = new FilteredTextRenderListener(new LocationTextExtractionStrategy(), filter);
                        sb.AppendLine(PdfTextExtractor.GetTextFromPage(reader, i, strategy));
                    }
                }
            }

            MakeTempFile(sb, path);
            return sb.ToString();
        }
Ejemplo n.º 28
0
        /// <summary>
        /// Returns the text found inside a rectangular region of a PDF page.
        /// </summary>
        /// <param name="CaminhoArquivo">Path of the PDF file to read.</param>
        /// <param name="Posicoes">Array with four numeric strings, passed in order to the <c>RectangleJ</c> constructor. They are parsed with <c>float.Parse</c> using the current culture.</param>
        /// <param name="Pagina">Number of the page that holds the data.</param>
        /// <returns>The extracted text, or a single space when the extracted text is empty.</returns>
        private string RetornarValor(String CaminhoArquivo, string[] Posicoes, int Pagina)
        {
            string retorno;

            // The using statement closes and disposes the reader; the explicit
            // Close()/Dispose() calls and null assignments of the original were redundant.
            using (PdfReader pdfReader = new PdfReader(CaminhoArquivo))
            {
                RectangleJ     rect         = new RectangleJ(float.Parse(Posicoes[0]), float.Parse(Posicoes[1]), float.Parse(Posicoes[2]), float.Parse(Posicoes[3]));
                RenderFilter[] renderFilter = { new RegionTextRenderFilter(rect) };

                ITextExtractionStrategy textExtractionStrategy = new FilteredTextRenderListener(new LocationTextExtractionStrategy(), renderFilter);
                retorno = PdfTextExtractor.GetTextFromPage(pdfReader, Pagina, textExtractionStrategy);
            }

            return retorno == string.Empty ? " " : retorno;
        }
Ejemplo n.º 29
0
        /// <summary>
        /// Extracts the text inside a rectangle of one page of a PDF file.
        /// </summary>
        /// <param name="filepath">Path of the PDF file.</param>
        /// <param name="pageno">Page number handed to the extractor.</param>
        /// <param name="cordinate1">First rectangle constructor argument (x).</param>
        /// <param name="coordinate2">Second rectangle constructor argument (y); see <paramref name="filter"/>.</param>
        /// <param name="coordinate3">Third rectangle constructor argument (x).</param>
        /// <param name="coordinate4">Fourth rectangle constructor argument (y); see <paramref name="filter"/>.</param>
        /// <param name="filter">When <c>true</c> the y values are used as given; when <c>false</c> each y value is replaced by <c>1000 - y</c>.</param>
        /// <returns>The text returned by the extractor for that region.</returns>
        public string getParagraphByCoOrdinate(string filepath, int pageno, int cordinate1, int coordinate2, int coordinate3, int coordinate4, bool filter)
        {
            // The two original branches differed only in how the y values were computed.
            int firstY  = filter ? coordinate2 : 1000 - coordinate2;
            int secondY = filter ? coordinate4 : 1000 - coordinate4;

            // Dispose the reader so the file is released after extraction.
            using (PdfReader reader = new PdfReader(filepath))
            {
                iTextSharp.text.Rectangle rect         = new iTextSharp.text.Rectangle(cordinate1, firstY, coordinate3, secondY);
                RenderFilter[]            renderFilter = { new RegionTextRenderFilter(rect) };
                ITextExtractionStrategy textExtractionStrategy = new FilteredTextRenderListener(new LocationTextExtractionStrategy(), renderFilter);

                return PdfTextExtractor.GetTextFromPage(reader, pageno, textExtractionStrategy);
            }
        }
Ejemplo n.º 30
0
// ---------------------------------------------------------------------------
        /// <summary>
        /// Writes a zip archive to <paramref name="stream"/> that contains the <c>PREFACE</c>
        /// PDF file and a <c>RESULT</c> entry holding the text extracted from a fixed region
        /// of every page of that PDF.
        /// </summary>
        /// <param name="stream">Destination the archive is saved to.</param>
        public void Write(Stream stream)
        {
            using (ZipFile zip = new ZipFile()) {
                zip.AddFile(PREFACE, "");

                StringBuilder sb = new StringBuilder();

                // Dispose the reader once the text was collected.
                using (PdfReader reader = new PdfReader(PREFACE))
                {
                    System.util.RectangleJ rect = new System.util.RectangleJ(
                        70, 80, 420, 500
                        );
                    RenderFilter[] filter = { new RegionTextRenderFilter(rect) };

                    for (int i = 1; i <= reader.NumberOfPages; i++)
                    {
                        // Each page is read with its own strategy instance.
                        ITextExtractionStrategy strategy = new FilteredTextRenderListener(
                            new LocationTextExtractionStrategy(), filter
                            );
                        sb.AppendLine(
                            PdfTextExtractor.GetTextFromPage(reader, i, strategy)
                            );
                    }
                }

                zip.AddEntry(RESULT, sb.ToString());
                zip.Save(stream);
            }
        }