Exemplo n.º 1
0
        /// <summary>
        /// Extracts text, with right-to-left run direction enabled, from three regions of
        /// page 1 of "in02.pdf" and compares each region's text with the expected string.
        /// </summary>
        public virtual void Test02()
        {
            // Again not completely correct. see test04()
            //TODO DEVSIX-2648
            PdfDocument doc = new PdfDocument(new PdfReader(sourceFolder + "in02.pdf"));

            // expected[k] is the text expected inside areas[k].
            String[] expected =
            {
                "1879 ",
                "\u05D4\u05D0\u05D5\u05E4\u05E0\u05D5\u05E2",
                ")\u05D2\u05D5\u05D8\u05DC\u05D9\u05D1 \u05D3\u05D9\u05D9\u05DE\u05DC\u05E8 \u05D5\u05D5\u05D9\u05DC\u05D4\u05DC\u05DD \u05DE\u05D9\u05D9\u05D1\u05D0\u05DA,1885 (,"
            };
            Rectangle[] areas =
            {
                new Rectangle(493, 765, 23, 11),
                new Rectangle(522, 784, 38, 12),
                new Rectangle(332, 784, 185, 12)
            };

            // One region filter and one right-to-left strategy per area, all on a single listener.
            FilteredEventListener eventListener = new FilteredEventListener();
            LocationTextExtractionStrategy[] strategies = new LocationTextExtractionStrategy[areas.Length];
            int slot = 0;
            foreach (Rectangle area in areas)
            {
                TextRegionEventFilter areaFilter = new TextRegionEventFilter(area);
                var rightToLeft = new LocationTextExtractionStrategy().SetRightToLeftRunDirection(true);
                strategies[slot] = eventListener.AttachEventListener(rightToLeft, areaFilter);
                slot++;
            }

            PdfCanvasProcessor canvasProcessor = new PdfCanvasProcessor(eventListener);
            canvasProcessor.ProcessPageContent(doc.GetPage(1));

            for (int k = 0; k < strategies.Length; k++)
            {
                NUnit.Framework.Assert.AreEqual(expected[k], strategies[k].GetResultantText());
            }
        }
        /// <summary>
        /// Attaches two region-filtered location strategies to one filtered listener, wraps that
        /// listener in a <c>GlyphEventListener</c>, processes page 1 of "test.pdf", and checks the
        /// text captured in each region.
        /// </summary>
        public virtual void TestWithMultiFilteredRenderListener()
        {
            PdfDocument doc = new PdfDocument(new PdfReader(sourceFolder + "test.pdf"));
            FilteredEventListener filtered = new FilteredEventListener();

            // Both regions use the same y coordinate and height; only x and width differ.
            const float regionY = 678.9f;
            const float regionHeight = 12f;

            Rectangle firstArea = new Rectangle(122f, regionY, 22f, regionHeight);
            ITextExtractionStrategy firstRegion = filtered.AttachEventListener(
                new LocationTextExtractionStrategy(), new TextRegionEventFilter(firstArea));

            Rectangle secondArea = new Rectangle(156f, regionY, 13f, regionHeight);
            ITextExtractionStrategy secondRegion = filtered.AttachEventListener(
                new LocationTextExtractionStrategy(), new TextRegionEventFilter(secondArea));

            GlyphEventListener glyphListener = new GlyphEventListener(filtered);
            new PdfCanvasProcessor(glyphListener).ProcessPageContent(doc.GetPage(1));

            NUnit.Framework.Assert.AreEqual("Your", firstRegion.GetResultantText());
            NUnit.Framework.Assert.AreEqual("dju", secondRegion.GetResultantText());
        }
Exemplo n.º 3
0
        /// <summary>
        /// Extracts text, with actual-text usage enabled, from six regions of page 1 of
        /// "test01.pdf" and compares each region's text with the expected string.
        /// </summary>
        public virtual void Test01()
        {
            PdfDocument doc = new PdfDocument(new PdfReader(sourceFolder + "test01.pdf"));

            // expected[k] is the text expected inside areas[k].
            String[] expected =
            {
                "\u0928\u093F\u0930\u094D\u0935\u093E\u091A\u0915",
                "\u0928\u0917\u0930\u0928\u093F\u0917\u092E / " + "\u0928\u0917\u0930\u092A\u0930\u093F\u0937\u0926" + " / \u0928\u0917\u0930\u092A\u093E\u0932\u093F\u0915\u093E \u0915\u093E \u0928\u093E\u092E",
                "\u0935 " + "\u0938\u0902\u0916\u094D\u092F\u093E",
                "\u0938\u0902\u0915\u094D\u0937\u093F\u092A\u094D\u0924 \u092A\u0941\u0928\u0930\u0940\u0915\u094D\u0937\u0923",
                "\u092E\u0924\u0926\u093E\u0928 " + "\u0915\u0947\u0928\u094D\u0926\u094D\u0930" + "\u0915\u093E",
                "\u0906\u0930\u0902\u092D\u093F\u0915 " + "\u0915\u094D\u0930\u092E\u0938\u0902\u0916\u094D\u092F\u093E"
            };
            Rectangle[] areas =
            {
                new Rectangle(30, 779, 45, 20),
                new Rectangle(30, 745, 210, 20),
                new Rectangle(30, 713, 42, 20),
                new Rectangle(30, 679, 80, 20),
                new Rectangle(30, 647, 73, 20),
                new Rectangle(30, 612, 93, 20)
            };

            // One region filter and one actual-text strategy per area, all on a single listener.
            FilteredEventListener eventListener = new FilteredEventListener();
            LocationTextExtractionStrategy[] strategies = new LocationTextExtractionStrategy[areas.Length];
            int slot = 0;
            foreach (Rectangle area in areas)
            {
                TextRegionEventFilter areaFilter = new TextRegionEventFilter(area);
                var actualTextStrategy = new LocationTextExtractionStrategy().SetUseActualText(true);
                strategies[slot] = eventListener.AttachEventListener(actualTextStrategy, areaFilter);
                slot++;
            }

            PdfCanvasProcessor canvasProcessor = new PdfCanvasProcessor(eventListener);
            canvasProcessor.ProcessPageContent(doc.GetPage(1));

            for (int k = 0; k < strategies.Length; k++)
            {
                NUnit.Framework.Assert.AreEqual(expected[k], strategies[k].GetResultantText());
            }
        }
Exemplo n.º 4
0
        /// <summary>
        /// Extracts the text of the first page of <c>SRC</c> that passes a
        /// <c>CustomFontFilter</c> built for a fixed rectangle, prints it to the console and
        /// writes it to <paramref name="dest"/>.
        /// </summary>
        /// <param name="dest">Path of the text file to create or overwrite.</param>
        public virtual void ManipulatePdf(String dest)
        {
            String actualText;

            // The using block guarantees the document (and its reader) is closed even when
            // page processing throws; previously Close() was skipped on any exception.
            using (PdfDocument pdfDoc = new PdfDocument(new PdfReader(SRC)))
            {
                Rectangle             rect       = new Rectangle(36, 750, 523, 56);
                CustomFontFilter      fontFilter = new CustomFontFilter(rect);
                FilteredEventListener listener   = new FilteredEventListener();

                // Create a text extraction renderer
                LocationTextExtractionStrategy extractionStrategy = listener
                                                                    .AttachEventListener(new LocationTextExtractionStrategy(), fontFilter);

                // Note: If you want to re-use the PdfCanvasProcessor, you must call PdfCanvasProcessor.Reset()
                new PdfCanvasProcessor(listener).ProcessPageContent(pdfDoc.GetFirstPage());

                // Get the resultant text after applying the custom filter
                actualText = extractionStrategy.GetResultantText();
            }

            // See the resultant text in the console
            Console.Out.WriteLine(actualText);

            using (StreamWriter writer = new StreamWriter(dest))
            {
                writer.Write(actualText);
            }
        }
Exemplo n.º 5
0
        /// <summary>
        /// Extracts text from three regions of page 1 of "test.pdf" and compares each region's
        /// text with the expected string.
        /// </summary>
        public virtual void Test()
        {
            PdfDocument doc = new PdfDocument(new PdfReader(sourceFolder + "test.pdf"));

            // expected[k] is the text expected inside areas[k].
            String[] expected =
            {
                "PostScript Compatibility",
                "Because the PostScript language does not support the transparent imaging \n"
                + "model, PDF 1.4 consumer applications must have some means for converting the \n"
                + "appearance of a document that uses transparency to a purely opaque description \n"
                + "for printing on PostScript output devices. Similar techniques can also be used to \n"
                + "convert such documents to a form that can be correctly viewed by PDF 1.3 and \n"
                + "earlier consumers. ",
                "Otherwise, flatten the colors to some assumed device color space with pre-\n"
                + "determined calibration. In the generated PostScript output, paint the flattened \n"
                + "colors in a CIE-based color space having that calibration. "
            };
            Rectangle[] areas =
            {
                new Rectangle(90, 581, 130, 24),
                new Rectangle(80, 486, 370, 92),
                new Rectangle(103, 143, 357, 53)
            };

            // One region filter and one location strategy per area, all on a single listener.
            FilteredEventListener eventListener = new FilteredEventListener();
            LocationTextExtractionStrategy[] strategies = new LocationTextExtractionStrategy[areas.Length];
            int slot = 0;
            foreach (Rectangle area in areas)
            {
                TextRegionEventFilter areaFilter = new TextRegionEventFilter(area);
                strategies[slot] = eventListener.AttachEventListener(new LocationTextExtractionStrategy(), areaFilter);
                slot++;
            }

            PdfCanvasProcessor canvasProcessor = new PdfCanvasProcessor(eventListener);
            canvasProcessor.ProcessPageContent(doc.GetPage(1));

            for (int k = 0; k < strategies.Length; k++)
            {
                NUnit.Framework.Assert.AreEqual(expected[k], strategies[k].GetResultantText());
            }
        }
Exemplo n.º 6
0
        /// <summary>
        /// Reads the PDF at <paramref name="path"/> on a thread-pool thread and returns a
        /// <c>File</c> whose <c>Content</c> is the text of all pages, in page order.
        /// </summary>
        /// <param name="path">Path of the PDF file to read.</param>
        /// <returns>A task producing the populated <c>File</c> (Mime "application/pdf").</returns>
        public Task <File> Convert(string path)
        {
            return Task.Run(() =>
            {
                var file = new File
                {
                    Path = path,
                    Mime = "application/pdf"
                };

                var content = new StringBuilder();

                using (var document = new PdfDocument(new PdfReader(path)))
                {
                    int numOfPages = document.GetNumberOfPages();

                    for (int i = 1; i <= numOfPages; i++)
                    {
                        // A fresh listener and strategy per page: the extraction strategy keeps
                        // every chunk it has received, so sharing one across pages made
                        // GetResultantText() return all previous pages again on each iteration
                        // (PdfCanvasProcessor.Reset() does not clear the strategy).
                        var listener = new FilteredEventListener();
                        var extractionStrategy = listener
                                                 .AttachEventListener(new LocationTextExtractionStrategy());

                        new PdfCanvasProcessor(listener).ProcessPageContent(document.GetPage(i));
                        content.Append(extractionStrategy.GetResultantText());
                    }
                }

                file.Content = content.ToString();
                return file;
            });
        }
Exemplo n.º 7
0
        /// <summary>
        /// For every "*.pdf" file in c:\temp\ime, extracts the text inside a fixed rectangle on
        /// page 2, prints the file name and the text to the console, and writes the text to a
        /// sibling file with the ".txt" extension.
        /// </summary>
        public static void ExtractPhysicalAddress()
        {
            var di = new DirectoryInfo(@"c:\temp\ime");

            foreach (var file in di.GetFiles("*.pdf"))
            {
                String actualText;

                // using guarantees the document is closed even if page processing throws.
                using (PdfDocument pdfDoc = new PdfDocument(new PdfReader(file.FullName)))
                {
                    Rectangle             rect         = new Rectangle(300, 470, 70, 150);
                    TextRegionEventFilter regionFilter = new TextRegionEventFilter(rect);

                    FilteredEventListener listener = new FilteredEventListener();

                    LocationTextExtractionStrategy extractionStrategy = listener
                                                                        .AttachEventListener(new LocationTextExtractionStrategy(), regionFilter);

                    new PdfCanvasProcessor(listener).ProcessPageContent(pdfDoc.GetPage(2));

                    actualText = extractionStrategy.GetResultantText();
                }

                Console.WriteLine(file.Name);
                Console.WriteLine(actualText);

                // Change only the extension. The previous Replace(".pdf", ".txt") was
                // case-sensitive and touched every ".pdf" in the path: for "X.PDF" the name was
                // left unchanged and the source PDF itself got overwritten with text.
                // Fully qualified because an imported PDF library namespace may also declare a Path type.
                string textPath = System.IO.Path.ChangeExtension(file.FullName, ".txt");

                using (StreamWriter writer = new StreamWriter(textPath))
                {
                    writer.Write(actualText);
                }
            }
        }
Exemplo n.º 8
0
        /// <summary>
        /// Reads the text found in a narrow strip next to the left margin of the page (the Oy
        /// axis) and stores the result collected by a <c>TextExtractionStrategy</c> over the
        /// same strip in <c>PositionOyAxis</c>.
        /// </summary>
        /// <param name="page">Page to read.</param>
        /// <returns>The non-blank lines found in the strip, one per line.</returns>
        internal StringBuilder ParsingOyAxis(PdfPage page)
        {
            // Strip to read: 20 units wide, starting at the left margin and 60 above the bottom margin.
            Rectangle axisArea = new Rectangle(Margin.Left, Margin.Bottom + 60, 20,
                                               page.GetPageSize().GetHeight() - Margin.Bottom - 160);
            TextRegionEventFilter axisFilter = new TextRegionEventFilter(axisArea);
            FilteredEventListener eventListener = new FilteredEventListener();

            // First pass: collect the text located inside the strip.
            LocationTextExtractionStrategy textStrategy =
                eventListener.AttachEventListener(new LocationTextExtractionStrategy(), axisFilter);

            lock (block)
            {
                PdfCanvasProcessor firstPass = new PdfCanvasProcessor(eventListener);
                firstPass.ProcessPageContent(page);
                firstPass.Reset();
            }

            // Keep every row that contains something other than whitespace.
            StringBuilder axisText = new StringBuilder();
            foreach (string row in textStrategy.GetResultantText().Split('\n'))
            {
                if (row.Trim().Length == 0)
                {
                    continue;
                }

                axisText.AppendLine(row);
            }

            // Second pass: the same listener and filter, with an additional strategy attached.
            TextExtractionStrategy positionStrategy =
                eventListener.AttachEventListener(new TextExtractionStrategy(), axisFilter);

            lock (block)
            {
                PdfCanvasProcessor secondPass = new PdfCanvasProcessor(eventListener);
                secondPass.ProcessPageContent(page);
                secondPass.Reset();
            }

            PositionOyAxis = positionStrategy.TextResult.ToArray();

            return axisText;
        }
Exemplo n.º 9
0
        /// <summary>
        /// Extracts the text content of a PDF, one entry per page.
        /// </summary>
        /// <param name="sourcePdf">The raw bytes of the PDF document.</param>
        /// <returns>
        /// A list with one string per page, in page order; each string is that page's text as
        /// produced by <c>SimpleTextExtractionStrategy</c>.
        /// </returns>
        public static List <String> ExtractTextPaged(byte[] sourcePdf)
        {
            List <String> textByPage = new List <string>();

            using (PdfDocument sourceDocument = new PdfDocument(new PdfReader(new MemoryStream(sourcePdf))))
            {
                int pageCount = sourceDocument.GetNumberOfPages();

                // Pages are 1-based, so the bound is inclusive; the previous "i < count" skipped
                // the last page (and returned nothing for a single-page document).
                for (int i = 1; i <= pageCount; i++)
                {
                    // A new listener and strategy per page keeps each entry limited to its own page.
                    FilteredEventListener        listener           = new FilteredEventListener();
                    SimpleTextExtractionStrategy extractionStrategy = listener.AttachEventListener(new SimpleTextExtractionStrategy());
                    new PdfCanvasProcessor(listener).ProcessPageContent(sourceDocument.GetPage(i));
                    textByPage.Add(extractionStrategy.GetResultantText());
                }
            }

            return textByPage;
        }
Exemplo n.º 10
0
        /// <summary>
        /// Reads the text found in a 10-unit-high strip located 70 units below the top margin,
        /// spanning from the left margin over a quarter of (page width - right margin).
        /// </summary>
        /// <param name="page">Page to read.</param>
        /// <returns>The text of the strip, each extracted row followed by a line terminator.</returns>
        internal StringBuilder ParsingColumns(PdfPage page)
        {
            // Area to read.
            Rectangle headerArea = new Rectangle(Margin.Left,
                                                 page.GetPageSize().GetHeight() - Margin.Top - 70,
                                                 (page.GetPageSize().GetWidth() - Margin.Right) / 4, 10);
            TextRegionEventFilter headerFilter = new TextRegionEventFilter(headerArea);
            FilteredEventListener eventListener = new FilteredEventListener();

            // Text extraction strategy limited to the header area.
            LocationTextExtractionStrategy textStrategy =
                eventListener.AttachEventListener(new LocationTextExtractionStrategy(), headerFilter);

            lock (block)
            {
                PdfCanvasProcessor canvasProcessor = new PdfCanvasProcessor(eventListener);
                canvasProcessor.ProcessPageContent(page);
                canvasProcessor.Reset();
            }

            // Copy every extracted row (blank ones included) into the result.
            StringBuilder columns = new StringBuilder();
            foreach (string row in textStrategy.GetResultantText().Split('\n'))
            {
                columns.AppendLine(row);
            }

            return columns;
        }
Exemplo n.º 11
0
        /// <summary>
        /// Extracts the text of the first page of the PDF at <paramref name="filePath"/> using
        /// an unfiltered <c>LocationTextExtractionStrategy</c>.
        /// </summary>
        /// <param name="filePath">Path of the PDF file to read.</param>
        /// <returns>The text of the first page.</returns>
        public static string ManipulatePdf(string filePath)
        {
            // using guarantees the document (and its reader) is closed even when page processing
            // throws; previously Close() was skipped on any exception.
            using (PdfDocument pdfDoc = new PdfDocument(new PdfReader(filePath)))
            {
                FilteredEventListener listener = new FilteredEventListener();

                // Create a text extraction renderer (no event filter is attached)
                LocationTextExtractionStrategy extractionStrategy = listener
                                                                    .AttachEventListener(new LocationTextExtractionStrategy());

                // Note: If you want to re-use the PdfCanvasProcessor, you must call PdfCanvasProcessor.Reset()
                new PdfCanvasProcessor(listener).ProcessPageContent(pdfDoc.GetFirstPage());

                return extractionStrategy.GetResultantText();
            }
        }
Exemplo n.º 12
0
        /// <summary>
        /// Attaches one location strategy guarded by four region filters to a
        /// <c>FilteredEventListener</c> and checks that it yields the same text as
        /// <c>PdfTextExtractor</c> driven by a <c>FilteredTextEventListener</c> with the same filters.
        /// </summary>
        public virtual void MultipleFiltersForOneRegionTest()
        {
            PdfDocument doc = new PdfDocument(new PdfReader(sourceFolder + "test.pdf"));

            Rectangle[] areas =
            {
                new Rectangle(0, 0, 500, 650),
                new Rectangle(0, 0, 400, 400),
                new Rectangle(200, 200, 300, 400),
                new Rectangle(100, 100, 350, 300)
            };

            // One filter per area, all of them passed together to a single strategy.
            TextRegionEventFilter[] areaFilters = Array.ConvertAll(areas, area => new TextRegionEventFilter(area));

            FilteredEventListener eventListener = new FilteredEventListener();
            LocationTextExtractionStrategy strategy =
                eventListener.AttachEventListener(new LocationTextExtractionStrategy(), areaFilters);

            PdfCanvasProcessor canvasProcessor = new PdfCanvasProcessor(eventListener);
            canvasProcessor.ProcessPageContent(doc.GetPage(1));
            String actual = strategy.GetResultantText();

            FilteredTextEventListener referenceListener =
                new FilteredTextEventListener(new LocationTextExtractionStrategy(), areaFilters);
            String expected = PdfTextExtractor.GetTextFromPage(doc.GetPage(1), referenceListener);

            NUnit.Framework.Assert.AreEqual(expected, actual);
        }
Exemplo n.º 13
0
        /// <summary>
        /// Renders the text and image chunks collected from a PDF page onto a new bitmap with a
        /// white background.
        /// </summary>
        /// <param name="pdfPage">Page to render.</param>
        /// <returns>
        /// A new bitmap sized from the page size; the caller owns it and is responsible for
        /// disposing it.
        /// </returns>
        public Bitmap ConvertToBitmap(PdfPage pdfPage)
        {
            var rotation = pdfPage.GetRotation();

            // Chunks are drawn in the key order of this sorted dictionary.
            var chunkDictionairy = new SortedDictionary <float, IChunk>();

            FilteredEventListener listener = new FilteredEventListener();

            listener.AttachEventListener(new TextListener(chunkDictionairy, IncreaseCounter));
            listener.AttachEventListener(new ImageListener(chunkDictionairy, IncreaseCounter));
            PdfCanvasProcessor processor = new PdfCanvasProcessor(listener);

            processor.ProcessPageContent(pdfPage);

            var size = pdfPage.GetPageSize();

            var width  = size.GetWidth().PointsToPixels();
            var height = size.GetHeight().PointsToPixels();

            Bitmap bmp = new Bitmap(width, height);

            try
            {
                bmp.SetResolution(MeasuringExtensions.Dpi, MeasuringExtensions.Dpi);
                using (Graphics g = Graphics.FromImage(bmp))
                {
                    g.FillRectangle(Brushes.White, 0, 0, bmp.Width, bmp.Height);

                    g.SmoothingMode     = SmoothingMode.AntiAlias;
                    g.InterpolationMode = InterpolationMode.HighQualityBicubic;
                    g.PixelOffsetMode   = PixelOffsetMode.HighQuality;
                    g.TextRenderingHint = TextRenderingHint.SingleBitPerPixelGridFit;

                    foreach (var chunk in chunkDictionairy)
                    {
                        // Every chunk starts from the page rotation only; its own translation
                        // is appended below.
                        g.ResetTransform();

                        g.RotateTransform(-rotation);

                        if (chunk.Value is Models.ImageChunk imageChunk)
                        {
                            var imgW = imageChunk.W.PointsToPixels();
                            var imgH = imageChunk.H.PointsToPixels();
                            var imgX = imageChunk.X.PointsToPixels();
                            // Flip y: the bitmap origin is top-left, the chunk's y is measured from the page bottom.
                            var imgY = (size.GetHeight() - imageChunk.Y - imageChunk.H).PointsToPixels();

                            g.TranslateTransform(imgX, imgY, MatrixOrder.Append);
                            g.DrawImage(imageChunk.Image, 0, 0, imgW, imgH);
                            imageChunk.Image.Dispose();
                        }
                        else if (chunk.Value is Models.TextChunk textChunk)
                        {
                            var chunkX = textChunk.Rect.GetX().PointsToPixels();
                            var chunkY = bmp.Height - textChunk.Rect.GetY().PointsToPixels();

                            var fontSize = textChunk.FontSize.PointsToPixels();

                            g.TranslateTransform(chunkX, chunkY, MatrixOrder.Append);

                            Font font;
                            try
                            {
                                font = new Font(textChunk.FontFamily, fontSize, textChunk.FontStyle, GraphicsUnit.Pixel);
                            }
                            catch (Exception)
                            {
                                // Best effort: if the requested font cannot be created, fall
                                // back to a fixed font instead of failing the whole page.
                                font = new Font("Calibri", 11, textChunk.FontStyle, GraphicsUnit.Pixel);
                            }

                            // Font and brush wrap GDI+ handles; release them per chunk instead
                            // of leaking one pair for every text chunk on the page.
                            using (font)
                            using (var brush = new SolidBrush(textChunk.Color))
                            {
                                g.DrawString(textChunk.Text, font, brush, 0, 0);
                            }
                        }
                    }

                    g.Flush();
                }

                return bmp;
            }
            catch
            {
                // Do not leak the bitmap when rendering fails part-way.
                bmp.Dispose();
                throw;
            }
        }
Exemplo n.º 14
0
        /// <summary>
        /// Renders the text, image and path chunks collected from a PDF page onto a new bitmap
        /// with a white background.
        /// </summary>
        /// <param name="pdfPage">Page to render.</param>
        /// <returns>
        /// A new bitmap sized from the page size; the caller owns it and is responsible for
        /// disposing it.
        /// </returns>
        public Bitmap ConvertToBitmap(PdfPage pdfPage)
        {
            var rotation = pdfPage.GetRotation();

            var size   = pdfPage.GetPageSize();
            var width  = size.GetWidth().PointsToPixels();
            var height = size.GetHeight().PointsToPixels();

            // Chunks are drawn in the key order of this sorted dictionary.
            var chunkDictionairy = new SortedDictionary <float, IChunk>();

            FilteredEventListener listener = new FilteredEventListener();

            listener.AttachEventListener(new TextListener(chunkDictionairy, IncreaseCounter));
            listener.AttachEventListener(new ImageListener(chunkDictionairy, IncreaseCounter));
            listener.AttachEventListener(new PathListener(chunkDictionairy, IncreaseCounter, size.GetHeight()));
            PdfCanvasProcessor processor = new PdfCanvasProcessor(listener);

            processor.ProcessPageContent(pdfPage);

            Bitmap bmp = new Bitmap(width, height);

            try
            {
                using (Graphics g = Graphics.FromImage(bmp))
                {
                    g.FillRectangle(Brushes.White, 0, 0, bmp.Width, bmp.Height);

                    g.SmoothingMode     = SmoothingMode.AntiAlias;
                    g.InterpolationMode = InterpolationMode.HighQualityBicubic;
                    g.PixelOffsetMode   = PixelOffsetMode.HighQuality;
                    g.TextRenderingHint = TextRenderingHint.SingleBitPerPixelGridFit;

                    foreach (var chunk in chunkDictionairy)
                    {
                        // Every chunk starts from the page rotation only; its own translation
                        // is appended below.
                        g.ResetTransform();

                        g.RotateTransform(-rotation);

                        if (chunk.Value is Models.ImageChunk imageChunk)
                        {
                            var imgW = imageChunk.W.PointsToPixels();
                            var imgH = imageChunk.H.PointsToPixels();
                            var imgX = imageChunk.X.PointsToPixels();
                            // Flip y: the bitmap origin is top-left, the chunk's y is measured from the page bottom.
                            var imgY = (size.GetHeight() - imageChunk.Y - imageChunk.H).PointsToPixels();

                            g.TranslateTransform(imgX, imgY, MatrixOrder.Append);
                            g.DrawImage(imageChunk.Image, 0, 0, imgW, imgH);
                            imageChunk.Image.Dispose();
                        }
                        else if (chunk.Value is Models.TextChunk textChunk)
                        {
                            var chunkX   = textChunk.Rect.GetX().PointsToPixels();
                            var chunkY   = bmp.Height - textChunk.Rect.GetY().PointsToPixels();
                            var fontSize = (textChunk.FontSize * textChunk.TextZoom).PointsToPixels();

                            g.TranslateTransform(chunkX, chunkY, MatrixOrder.Append);

                            Font font;
                            try
                            {
                                font = new Font(textChunk.FontFamily, fontSize, textChunk.FontStyle, GraphicsUnit.Pixel);
                            }
                            catch (Exception)
                            {
                                // Best effort: if the requested font cannot be created, fall
                                // back to a fixed font instead of failing the whole page.
                                font = new Font("Calibri", 12, textChunk.FontStyle, GraphicsUnit.Pixel);
                            }

                            // Font and brush wrap GDI+ handles; release them per chunk instead
                            // of leaking one pair for every text chunk on the page.
                            using (font)
                            using (var brush = new SolidBrush(textChunk.Color))
                            {
                                g.DrawString(textChunk.Text, font, brush, 0, 0);
                            }
                        }
                        else if (chunk.Value is Models.PathChunk pathChunk)
                        {
                            Trace.WriteLine("pathChunk pathChunk pathChunk");

                            // NOTE(review): y is used as delivered by the path chunk, without
                            // flipping against the bitmap height; PathListener receives the
                            // page height, so it presumably flips already - confirm.
                            float x1 = ((float)pathChunk.StartPath.x).PointsToPixels();
                            float y1 = ((float)pathChunk.StartPath.y).PointsToPixels();
                            float x2 = ((float)pathChunk.EndPath.x).PointsToPixels();
                            float y2 = ((float)pathChunk.EndPath.y).PointsToPixels();

                            // Define a pen and draw the line; the pen is disposed right after use.
                            using (Pen newPen = new Pen(Color.Black))
                            {
                                g.DrawLine(newPen, x1, y1, x2, y2);
                            }
                        }
                    }

                    g.Flush();
                }

                return bmp;
            }
            catch
            {
                // Do not leak the bitmap when rendering fails part-way.
                bmp.Dispose();
                throw;
            }
        }
Exemplo n.º 15
0
        /// <summary>
        /// Searches the text lines of a range of pages for a case-insensitive regular
        /// expression and returns one dictionary per located match.
        /// </summary>
        /// <param name="pdf">Document to search.</param>
        /// <param name="regex">Regular expression pattern; matched with <c>RegexOptions.IgnoreCase</c>.</param>
        /// <param name="minPage">First page to search (1-based). Values outside 1..page count start at page 1.</param>
        /// <param name="numPages">Number of pages to search. Values outside 1..page count mean "up to the last page".</param>
        /// <param name="count">Maximum number of matches to return; 0 means no limit.</param>
        /// <returns>
        /// A list of dictionaries with the keys "Page", "Text", "X", "Y", "Width", "Height",
        /// "FontSize" and "FontFamily". Matches for which no rectangle can be determined are skipped.
        /// </returns>
        public static List <Dictionary <string, object> > FindMatches(PdfDocument pdf, string regex, int minPage = 0, int numPages = 0, int count = 0)
        {
            FilteredEventListener listener  = new FilteredEventListener();
            TextLocationStrategy  s         = listener.AttachEventListener(new TextLocationStrategy());
            PdfCanvasProcessor    processor = new PdfCanvasProcessor(listener);

            int maxNumPages = pdf.GetNumberOfPages();

            if (minPage <= 0 || minPage > maxNumPages)
            {
                minPage = 1;
            }
            if (numPages <= 0 || numPages > maxNumPages)
            {
                numPages = maxNumPages;
            }

            // Last page to visit, clamped to the document. The previous loop ignored numPages
            // and always ran maxNumPages iterations starting at minPage, so it requested pages
            // beyond the end of the document whenever minPage was greater than 1.
            int lastPage = Math.Min(maxNumPages, minPage + numPages - 1);

            // Build the pattern once instead of re-parsing it for every search.
            Regex pattern = new Regex(regex, RegexOptions.IgnoreCase);

            List <Dictionary <string, object> > list = new List <Dictionary <string, object> > ();

            for (int pageNum = minPage; pageNum <= lastPage; pageNum++)
            {
                s.Reset();
                PdfPage page = pdf.GetPage(pageNum);
                processor.ProcessPageContent(page);
                s.Finish();

                foreach (TextLine x in s.GetLines())
                {
                    // Resume each search after the previous match. Restarting from index 0
                    // never terminated for empty matches or for patterns that also match the
                    // 'X' mask written below.
                    int searchFrom = 0;

                    while (searchFrom <= x.text.Length)
                    {
                        Match m = pattern.Match(x.text, searchFrom);
                        if (!m.Success)
                        {
                            break;
                        }

                        if (m.Length == 0)
                        {
                            // An empty match has no extent to report; step past it.
                            searchFrom = m.Index + 1;
                            continue;
                        }

                        // Use the match's own position: IndexOf(m.Value) could land on an
                        // earlier, non-matching occurrence of the same text.
                        int i = m.Index;
                        int n = m.Length;
                        searchFrom = i + n;

                        // Mask the matched characters in the line, as before, so the line's
                        // text reflects what has already been reported.
                        x.text = x.text.Substring(0, i) + new String('X', n) + x.text.Substring(i + n);

                        Rectangle r = x.GetRect(i, n);
                        if (r == null)
                        {
                            continue;
                        }

                        Dictionary <string, object> data = new Dictionary <string, object> ();
                        list.Add(data);

                        data["Page"]       = pageNum;
                        data["Text"]       = m.Value;
                        data["X"]          = r.GetLeft();
                        data["Y"]          = r.GetBottom();
                        data["Width"]      = r.GetWidth();
                        data["Height"]     = r.GetHeight();
                        // Font information is taken from the first character of the line.
                        data["FontSize"]   = x.chars[0].fontSize;
                        data["FontFamily"] = x.chars[0].fontFamily;

                        // Limit on the number of matches. The previous check compared the
                        // number of keys in "data" (always 8) instead of the number of results.
                        if (count != 0 && list.Count >= count)
                        {
                            return list;
                        }
                    }
                }
            }

            return list;
        }
Exemplo n.º 16
0
        /// <summary>
        /// Extracts the text of pages 208-253 of the PDF given as the first argument, writes it
        /// to "EntireText.txt", and dumps every entry of <c>TextWithWidth</c> to "AllLines.txt".
        /// Waits for Enter after reading and again before exiting.
        /// </summary>
        /// <param name="args">Command-line arguments; the first one is the path of the PDF.</param>
        public static void Main(string[] args)
        {
            Contract.Assert(args.Any(), "No arguments passed.");

            var path          = args.First();
            var extractedText = string.Empty;

            // monsters are on page 208 - 253

            using (var pdfReader = new PdfReader(path))
            {
                var          stringBuilder = new StringBuilder();
                var          pdfDocument   = new PdfDocument(pdfReader);
                var          listener      = new FilteredEventListener();
                CustomFilter customFilter  = null;
                SimpleTextExtractionStrategy extractionStrategy = null;
                PdfCanvasProcessor           pdfProcessor       = null;

                // Length of the strategy's text that has already been copied to stringBuilder.
                int consumedLength = 0;

                for (int i = 208; i <= 253; i++)
                {
                    Console.WriteLine(@"Reading page {0}.", i);
                    var page = pdfDocument.GetPage(i);

                    if (pdfProcessor == null)
                    {
                        // The filter, strategy and processor are created once, from the first
                        // page's art box, and reused for all following pages.
                        customFilter       = new CustomFilter(page.GetArtBox());
                        extractionStrategy = listener.AttachEventListener(new SimpleTextExtractionStrategy(), customFilter);
                        pdfProcessor       = new PdfCanvasProcessor(listener);
                    }

                    pdfProcessor.ProcessPageContent(page);

                    // The shared strategy keeps accumulating text, so GetResultantText()
                    // returns everything read so far. Append only the part added by this page;
                    // appending the whole string repeated all earlier pages on every iteration.
                    string accumulated = extractionStrategy.GetResultantText();
                    if (accumulated.Length < consumedLength)
                    {
                        consumedLength = 0;
                    }
                    stringBuilder.AppendLine(accumulated.Substring(consumedLength));
                    consumedLength = accumulated.Length;

                    // Debugging aid: break when the last two collected entries are equal.
                    // Guarded so a list with fewer than two entries no longer throws.
                    var lastNode = Program.TextWithWidth.Last;
                    if (lastNode != null && lastNode.Previous != null &&
                        lastNode.Value.Equals(lastNode.Previous.Value))
                    {
                        Debugger.Break();
                    }
                }

                extractedText = stringBuilder.ToString();
            }

            //m_parser = new TextToMonsterDataParser(extractedText);
            //var monsters = m_parser.GetMonsterData();


            Console.WriteLine("Read all text.");
            Console.ReadLine();

            if (System.IO.File.Exists(@"AllLines.txt"))
            {
                System.IO.File.Delete(@"AllLines.txt");
            }

            if (System.IO.File.Exists(@"EntireText.txt"))
            {
                System.IO.File.Delete(@"EntireText.txt");
            }

            Console.WriteLine("Printing text.");
            System.IO.File.WriteAllText(@"EntireText.txt", extractedText);

            Console.WriteLine("Found {0} duplicates.", TextWithWidth.Count(n => n.String.Equals("die’")));

            Console.WriteLine("Printing lines.");


            using (System.IO.StreamWriter file = new System.IO.StreamWriter(@"AllLines.txt", false))
            {
                var node = TextWithWidth.First;
                while (node != null)
                {
                    file.WriteLine("{0}({1})", node.Value.String, node.Value.Width);
                    node = node.Next;
                }
            }

            Console.ReadLine();
        }