示例#1
0
        /// <summary>
        /// Opens the PDF at <c>D:\191.pdf</c>, extracts the text found inside a fixed region of every page,
        /// and writes each page's text to the console, waiting for Enter after each one.
        /// </summary>
        /// <param name="args">Command-line arguments (not used).</param>
        static void Main(string[] args)
        {
            PdfReader reader = new PdfReader(@"D:\191.pdf");

            // Lazily yields the text inside the rectangle (llx, lly)-(urx, ury), one string per page.
            IEnumerable<string> GetColumnText(float llx, float lly, float urx, float ury)
            {
                // The region and its filter are the same for every page, so build them once.
                var rect         = new iTextSharp.text.Rectangle(llx, lly, urx, ury);
                var renderFilter = new RenderFilter[] { new RegionTextRenderFilter(rect) };
                int pageTotal    = reader.NumberOfPages;

                for (int pageNumber = 1; pageNumber <= pageTotal; pageNumber++)
                {
                    // A fresh listener/strategy pair is created for each page, as in the original loop.
                    var textExtractionStrategy = new FilteredTextRenderListener(new LocationTextExtractionStrategy(), renderFilter);
                    yield return PdfTextExtractor.GetTextFromPage(reader, pageNumber, textExtractionStrategy);
                }
            }

            try
            {
                foreach (string result in GetColumnText(0, 0, 500, 500000))
                {
                    Console.Write("{0} ", result);
                    Console.ReadLine();
                }
            }
            finally
            {
                // Release the reader even when extraction or console I/O throws.
                reader.Close();
            }
        }
示例#2
0
        /// <summary>
        /// Extracts the text inside a rectangular region of one page and returns its non-blank lines, trimmed.
        /// Each returned line is also written to the console.
        /// </summary>
        /// <param name="reader">Open reader for the PDF.</param>
        /// <param name="pageNumber">Page number passed to the extractor.</param>
        /// <param name="cordinate1">First rectangle constructor argument.</param>
        /// <param name="coordinate2">Second rectangle constructor argument.</param>
        /// <param name="coordinate3">Third rectangle constructor argument.</param>
        /// <param name="coordinate4">Fourth rectangle constructor argument.</param>
        /// <returns>The trimmed, non-blank lines of the region's text, in order.</returns>
        public List <string> getTextByCoOrdinate(PdfReader reader, int pageNumber, int cordinate1, int coordinate2, int coordinate3, int coordinate4)
        {
            var region  = new iTextSharp.text.Rectangle(cordinate1, coordinate2, coordinate3, coordinate4);
            var filters = new RenderFilter[] { new RegionTextRenderFilter(region) };
            ITextExtractionStrategy strategy = new FilteredTextRenderListener(new LocationTextExtractionStrategy(), filters);

            string pageText = PdfTextExtractor.GetTextFromPage(reader, pageNumber, strategy);

            // Collect every line that carries visible content.
            var lines = new List <string>();
            foreach (string rawLine in pageText.Split('\n'))
            {
                if (string.IsNullOrWhiteSpace(rawLine))
                {
                    continue;
                }

                lines.Add(rawLine.Trim());
            }

            // Echo the collected lines.
            for (int index = 0; index < lines.Count; index++)
            {
                Console.WriteLine(lines[index]);
            }

            return lines;
        }
示例#3
0
        /// <summary>
        /// Returns the text found inside the rectangle (llx, lly)-(urx, ury) on the given page.
        /// </summary>
        /// <param name="reader">Open reader for the PDF.</param>
        /// <param name="pageNumber">Page number passed to the extractor.</param>
        /// <param name="llx">Lower-left x of the region.</param>
        /// <param name="lly">Lower-left y of the region.</param>
        /// <param name="urx">Upper-right x of the region.</param>
        /// <param name="ury">Upper-right y of the region.</param>
        /// <returns>The extracted text.</returns>
        public static string GetRectangle(PdfReader reader, int pageNumber, float llx, float lly, float urx, float ury)
        {
            var region  = new iTextSharp.text.Rectangle(llx, lly, urx, ury);
            var filters = new RenderFilter[] { new RegionTextRenderFilter(region) };
            ITextExtractionStrategy strategy = new FilteredTextRenderListener(new LocationTextExtractionStrategy(), filters);

            string regionText = PdfTextExtractor.GetTextFromPage(reader, pageNumber, strategy);
            return regionText;
        }
        /// <summary>
        /// Extracts the text inside <paramref name="column"/> on one page and hands it to
        /// <c>SplitColumnTextIntoRows</c>.
        /// </summary>
        /// <param name="column">Region to read.</param>
        /// <param name="reader">Open reader for the PDF.</param>
        /// <param name="page">Page number passed to the extractor.</param>
        /// <returns>Whatever <c>SplitColumnTextIntoRows</c> returns for the extracted text.</returns>
        private string[] ExtractCurrentColumnFromPage(Rectangle column, PdfReader reader, int page)
        {
            var filters = new RenderFilter[1];
            filters[0] = new RegionTextRenderFilter(column);

            var listener   = new FilteredTextRenderListener(new LocationTextExtractionStrategy(), filters);
            var columnText = PdfTextExtractor.GetTextFromPage(reader, page, listener);

            string[] rows = SplitColumnTextIntoRows(columnText);
            return rows;
        }
示例#5
0
        /// <summary>
        /// Opens the PDF at <paramref name="file"/> and returns the text found inside
        /// <paramref name="rectangle"/> on page 1.
        /// </summary>
        /// <param name="file">Path of the PDF to open.</param>
        /// <param name="rectangle">Region of the page to read.</param>
        /// <returns>The extracted text.</returns>
        public string GetStringValueFromRegion(string file, iTextSharp.text.Rectangle rectangle)
        {
            var reader = new PdfReader(file);

            try
            {
                var renderFilter = new RenderFilter[] { new RegionTextRenderFilter(rectangle) };
                ITextExtractionStrategy textExtractionStrategy = new FilteredTextRenderListener(new LocationTextExtractionStrategy(), renderFilter);

                return PdfTextExtractor.GetTextFromPage(reader, 1, textExtractionStrategy);
            }
            finally
            {
                // The reader is created here, so it must be released here, including on failure.
                reader.Close();
            }
        }
示例#6
0
 /// <summary>
 /// Toggles <paramref name="toggle"/> in <c>DrawFilter</c>: if any of its bits are currently set they are
 /// all cleared, otherwise they are all set.
 /// </summary>
 /// <param name="toggle">Bit(s) to toggle.</param>
 public void ToggleDrawFilter(RenderFilter toggle)
 {
     bool anyBitSet = (DrawFilter & toggle) > 0;

     DrawFilter = anyBitSet
         ? DrawFilter & ~toggle
         : DrawFilter | toggle;
 }
示例#7
0
        /// <summary>
        /// Builds a text-extraction strategy restricted to the region described by the four arguments.
        /// </summary>
        /// <param name="pixelDistanceFromLeft">First <c>RectangleJ</c> constructor argument.</param>
        /// <param name="pixelDistanceFromBottom">Second <c>RectangleJ</c> constructor argument.</param>
        /// <param name="pixelDistanceWidth">Third <c>RectangleJ</c> constructor argument.</param>
        /// <param name="pixelDistanceHeight">Fourth <c>RectangleJ</c> constructor argument.</param>
        /// <returns>A location-based strategy wrapped in a region filter.</returns>
        public static ITextExtractionStrategy MakeRectangle(float pixelDistanceFromLeft, float pixelDistanceFromBottom, float pixelDistanceWidth, float pixelDistanceHeight)
        {
            var region = new System.util.RectangleJ(pixelDistanceFromLeft, pixelDistanceFromBottom, pixelDistanceWidth, pixelDistanceHeight);
            var regionFilters = new RenderFilter[] { new RegionTextRenderFilter(region) };

            return new FilteredTextRenderListener(new LocationTextExtractionStrategy(), regionFilters);
        }
示例#8
0
        /// <summary>
        /// Returns the text inside the rectangle (llx, lly)-(urx, ury) on one page.
        /// </summary>
        /// <remarks>Reminder: parameters are in points, and 1 in = 2.54 cm = 72 points.</remarks>
        /// <param name="reader">Open reader for the PDF.</param>
        /// <param name="pageNum">Page number passed to the extractor.</param>
        /// <param name="llx">Lower-left x of the region.</param>
        /// <param name="lly">Lower-left y of the region.</param>
        /// <param name="urx">Upper-right x of the region.</param>
        /// <param name="ury">Upper-right y of the region.</param>
        /// <returns>The extracted text.</returns>
        private static string GetColumnText(PdfReader reader, int pageNum, float llx, float lly, float urx, float ury)
        {
            var columnArea = new iTextSharp.text.Rectangle(llx, lly, urx, ury);
            var filters    = new RenderFilter[] { new RegionTextRenderFilter(columnArea) };
            var listener   = new FilteredTextRenderListener(new LocationTextExtractionStrategy(), filters);

            return PdfTextExtractor.GetTextFromPage(reader, pageNum, listener);
        }
示例#9
0
        /// <summary>
        /// Reads two regions (A and B) from page 1 of <c>reader</c> and stores the extracted text in
        /// <c>textA</c> and <c>textB</c>. The region corners come from the <c>coordinate*a</c> and
        /// <c>coordinate*b</c> members.
        /// </summary>
        private void ReadText()
        {
            var regionA = new iTextSharp.text.Rectangle(coordinate1a, coordinate2a, coordinate3a, coordinate4a);
            var regionB = new iTextSharp.text.Rectangle(coordinate1b, coordinate2b, coordinate3b, coordinate4b);

            // One filter per region; each region gets its own listener so the texts stay separate.
            RenderFilter filterA = new RegionTextRenderFilter(regionA);
            RenderFilter filterB = new RegionTextRenderFilter(regionB);

            ITextExtractionStrategy strategyA = new FilteredTextRenderListener(new LocationTextExtractionStrategy(), filterA);
            ITextExtractionStrategy strategyB = new FilteredTextRenderListener(new LocationTextExtractionStrategy(), filterB);

            textA = PdfTextExtractor.GetTextFromPage(reader, 1, strategyA);
            textB = PdfTextExtractor.GetTextFromPage(reader, 1, strategyB);
        }
        /// <summary>
        /// Extracts text from PDF file. (only selectable text content from the pages, not OCR from images)
        /// </summary>
        /// <param name="filepath">input file path</param>
        /// <param name="zone">Rectangle which specifies the zone where the text is extracted from a page.
        /// Its values are converted from millimetres to points and measured from the top of the page.
        /// If it's null, then the full page is processed.</param>
        /// <param name="pages">List of pages to extract data from. If null or first item is 0, all pages will be
        /// extracted. An empty list requests no pages and yields an empty result. Page numbers below 1 or above
        /// the page count are skipped.</param>
        /// <returns>a list of strings. one string from each extracted page, with line breaks converted to
        /// <see cref="Environment.NewLine"/></returns>
        public static List <string> GetPdfTextFromPages(string filepath, RectangleF?zone = null, List <int> pages = null)
        {
            using (PdfReader reader = new PdfReader(filepath))
            {
                List <string> result    = new List <string>();
                int           pageCount = reader.NumberOfPages;

                // Count is checked before reading the first item so an empty list cannot throw.
                if (pages == null || (pages.Count > 0 && pages[0] == 0))
                {
                    pages = Enumerable.Range(1, pageCount).ToList(); //create the list of all pagenumbers in the actual PDF
                }

                foreach (var i in pages)
                {
                    // Page numbers are 1-based; ignore anything outside the document.
                    if (i < 1 || i > pageCount)
                    {
                        continue;
                    }

                    string text;

                    if (zone.HasValue)
                    { //zone based text extract
                        float x = Utilities.MillimetersToPoints(zone.Value.X);
                        float y = Utilities.MillimetersToPoints(zone.Value.Y);
                        float w = Utilities.MillimetersToPoints(zone.Value.Width);
                        float h = Utilities.MillimetersToPoints(zone.Value.Height);

                        var pagesize = reader.GetPageSizeWithRotation(i);
                        //translate top-left based coordinates to iText (y is subtracted from the page top)
                        var rect = new iTextSharp.text.Rectangle(x, pagesize.Top - y, x + w, pagesize.Top - y - h);

                        var renderFilter = new RenderFilter[] { new RegionTextRenderFilter(rect) };
                        ITextExtractionStrategy textExtractionStrategy = new FilteredTextRenderListener(new LocationTextExtractionStrategy(), renderFilter);
                        text = PdfTextExtractor.GetTextFromPage(reader, i, textExtractionStrategy);
                    }
                    else
                    { //full page text extract
                        text = PdfTextExtractor.GetTextFromPage(reader, i);
                    }

                    result.Add(text.Replace("\n", Environment.NewLine));
                }

                // The using block releases the reader; no explicit Close() is needed.
                return result;
            }
        }
示例#11
0
        /// <summary>
        /// Marks entries as culled when they do not pass the render filter or, unless the display group is
        /// always visible (or null), when they belong to a draw group that is not in <paramref name="dispGroup"/>.
        /// Entries that are already culled are left alone.
        /// </summary>
        /// <param name="filter">Render-filter bits an entry must share to stay visible.</param>
        /// <param name="dispGroup">Display group to test against; null is treated as always visible.</param>
        public void ProcessSceneVisibility(RenderFilter filter, DrawGroup dispGroup)
        {
            bool alwaysVis = dispGroup == null || dispGroup.AlwaysVisible;

            for (int slot = 0; slot < SYSTEM_SIZE; slot++)
            {
                if (cCulled[slot])
                {
                    continue;
                }

                bool passesFilter = (cSceneVis[slot]._renderFilter & filter) != 0;

                // The group test only runs for entries that passed the filter, as in the original order.
                bool hiddenByGroup = passesFilter &&
                                     !alwaysVis &&
                                     cSceneVis[slot]._drawGroup != null &&
                                     !cSceneVis[slot]._drawGroup.IsInDisplayGroup(dispGroup);

                if (!passesFilter || hiddenByGroup)
                {
                    cCulled[slot] = true;
                }
            }
        }
示例#12
0
        /// <summary>
        /// Opens the PDF at <paramref name="filepath"/> and returns the text inside a rectangular region of
        /// one page.
        /// </summary>
        /// <param name="filepath">Path of the PDF to open.</param>
        /// <param name="pageno">Page number passed to the extractor.</param>
        /// <param name="cordinate1">First rectangle constructor argument (used as given).</param>
        /// <param name="coordinate2">Second rectangle constructor argument; replaced by <c>1000 - coordinate2</c>
        /// when <paramref name="filter"/> is false.</param>
        /// <param name="coordinate3">Third rectangle constructor argument (used as given).</param>
        /// <param name="coordinate4">Fourth rectangle constructor argument; replaced by <c>1000 - coordinate4</c>
        /// when <paramref name="filter"/> is false.</param>
        /// <param name="filter">True to use the coordinates unchanged; false to subtract the second and fourth
        /// from 1000 first.</param>
        /// <returns>The extracted text.</returns>
        public string getParagraphByCoOrdinate(string filepath, int pageno, int cordinate1, int coordinate2, int coordinate3, int coordinate4, bool filter)
        {
            PdfReader reader = new PdfReader(filepath);

            try
            {
                // The two modes differ only in how the second and fourth coordinates are interpreted.
                int second = filter ? coordinate2 : 1000 - coordinate2;
                int fourth = filter ? coordinate4 : 1000 - coordinate4;

                var rect         = new iTextSharp.text.Rectangle(cordinate1, second, coordinate3, fourth);
                var renderFilter = new RenderFilter[] { new RegionTextRenderFilter(rect) };
                ITextExtractionStrategy textExtractionStrategy = new FilteredTextRenderListener(new LocationTextExtractionStrategy(), renderFilter);

                return PdfTextExtractor.GetTextFromPage(reader, pageno, textExtractionStrategy);
            }
            finally
            {
                // The reader is created here, so it must be released here, including on failure.
                reader.Close();
            }
        }
示例#13
0
        //static public Dictionary<string, string> ExtractTexts(this PdfReader pr, int pageI, Dictionary<string, RenderFilter[]> fieldNames2filters)
        //{
        //    Dictionary<string, string> fieldNames2texts = new Dictionary<string, string>();
        //    foreach (string fn in fieldNames2filters.Keys)
        //        fieldNames2texts[fn] = pr.ExtractText(pageI, fieldNames2filters[fn]);
        //    return fieldNames2texts;
        //}

        /// <summary>
        /// Extracts the text inside the region (x, y, w, h) of one page by building a single region filter
        /// and delegating to the <c>ExtractText</c> overload that takes a filter array.
        /// </summary>
        /// <param name="pr">Reader to extract from.</param>
        /// <param name="pageI">Page argument forwarded to the filter-array overload.</param>
        /// <param name="x">First <c>RectangleJ</c> constructor argument.</param>
        /// <param name="y">Second <c>RectangleJ</c> constructor argument.</param>
        /// <param name="w">Third <c>RectangleJ</c> constructor argument.</param>
        /// <param name="h">Fourth <c>RectangleJ</c> constructor argument.</param>
        /// <returns>The text returned by the filter-array overload.</returns>
        public static string ExtractText(this PdfReader pr, int pageI, float x, float y, float w, float h)
        {
            var region = new System.util.RectangleJ(x, y, w, h);

            RenderFilter[] regionFilters = new RenderFilter[1];
            regionFilters[0] = new RegionTextRenderFilter(region);

            return ExtractText(pr, pageI, regionFilters);
        }
示例#14
0
        /// <summary>
        /// Dumps the text of every page of a PDF to <paramref name="outFileName"/> and writes one
        /// semicolon-separated line per page to a companion file whose name is the output file name
        /// prefixed with <c>"filtered "</c>, in the same directory.
        /// </summary>
        /// <remarks>
        /// Each line of the companion file contains, in order: the "Matricule … SS …" identifiers found by
        /// regex (if any), the text extracted from the identifier region, the "Net imposable" value found by
        /// regex (if any), and the text extracted from the net-amount region.
        /// </remarks>
        /// <param name="fileName">Path of the PDF to read.</param>
        /// <param name="outFileName">Path of the full-text output file (UTF-8, overwritten).</param>
        public static void ExtractText(string fileName, string outFileName)
        {
            using (StreamWriter outFile = new StreamWriter(outFileName, false, System.Text.Encoding.UTF8))
            {
                // Prefix only the file name, so an output path with a directory or drive stays valid.
                // System.IO.Path is fully qualified to avoid clashing with any other type named Path.
                string outDirectory = System.IO.Path.GetDirectoryName(outFileName);
                string filteredName = "filtered " + System.IO.Path.GetFileName(outFileName);
                string filteredPath = string.IsNullOrEmpty(outDirectory)
                    ? filteredName
                    : System.IO.Path.Combine(outDirectory, filteredName);

                using (StreamWriter outFile2 = new StreamWriter(filteredPath, false, System.Text.Encoding.UTF8))
                {
                    PdfReader pdfReader = new PdfReader(fileName);

                    try
                    {
                        for (int page = 1; page <= pdfReader.NumberOfPages; page++)
                        {
                            // Full page text goes to the first file untouched.
                            ITextExtractionStrategy strategy = new SimpleTextExtractionStrategy();
                            string currentPageText = PdfTextExtractor.GetTextFromPage(pdfReader, page, strategy);
                            outFile.Write(currentPageText);

                            // Field 1: identifiers located by regex in the page text.
                            var match = Regex.Match(currentPageText, "Matricule : *([0-9]*)  SS : ([0-9]*)");
                            if (match.Success)
                            {
                                string id1 = match.Groups[1].Value;
                                string id2 = match.Groups[2].Value;
                                outFile2.Write("Matricule : " + id1 + "  SS : " + id2);
                            }
                            outFile2.Write(";");

                            // Field 2: the same identifiers read by position.
                            // NOTE(review): the upper bound uses 848 while the lower one uses 842 - confirm whether 848 is a typo.
                            var rect         = new iTextSharp.text.Rectangle(400, 842 - 128, 480, 848 - 110);
                            var renderFilter = new RenderFilter[] { new RegionTextRenderFilter(rect) };
                            ITextExtractionStrategy textExtractionStrategy = new FilteredTextRenderListener(new LocationTextExtractionStrategy(), renderFilter);
                            string idLocation = PdfTextExtractor.GetTextFromPage(pdfReader, page, textExtractionStrategy);
                            outFile2.Write(";");
                            outFile2.Write(idLocation);
                            outFile2.Write(";");

                            // Field 3: taxable net amount located by regex.
                            match = Regex.Match(currentPageText, "Net imposable(.*)Net imposable");
                            if (match.Success)
                            {
                                outFile2.Write(match.Groups[1].Value);
                            }

                            // Field 4: taxable net amount read by position.
                            // Notes kept from the original author's sample data: on page 259 (back-pay for a
                            // prior period) there is no taxable net for the period; on page 56 the region is
                            // empty, which is normal because the payslip spans several pages.
                            rect                   = new iTextSharp.text.Rectangle(150, 240, 210, 250);
                            renderFilter           = new RenderFilter[] { new RegionTextRenderFilter(rect) };
                            textExtractionStrategy = new FilteredTextRenderListener(new LocationTextExtractionStrategy(), renderFilter);
                            string netLocation = PdfTextExtractor.GetTextFromPage(pdfReader, page, textExtractionStrategy);

                            outFile2.Write(";");
                            outFile2.Write(netLocation);
                            outFile2.WriteLine();
                        }
                    }
                    finally
                    {
                        // Release the reader even if extraction or writing fails part-way.
                        pdfReader.Close();
                    }
                }
            }
        }
示例#15
0
        /// <summary>
        /// Walks every link annotation in the PDF at <paramref name="path"/>, requests the link target, and
        /// adds one row per checked link to <c>skuGrid</c>.
        /// </summary>
        /// <remarks>
        /// Only annotations of subtype <c>/Link</c> that carry a URI action are checked. A URI that starts
        /// with "https" and is exactly 70 characters long is treated as a SKU link (the SKU is the text
        /// after the first '='); any other URI starting with "www" or "https" is treated as a plain link.
        /// Cancellation is observed between pages and between annotations; when it is requested the method
        /// reports "Scrape stopped" and returns without rethrowing. Any other exception is shown and logged,
        /// and ends the scrape. The reader is always closed and the log always written.
        /// </remarks>
        /// <param name="token">Token used to stop the scrape early.</param>
        /// <param name="path">Path of the PDF to scan.</param>
        private void Scrape(CancellationToken token, string path)
        {
            PdfReader reader = new PdfReader(path);

            try
            {
                // Pagination
                for (var i = 1; i <= reader.NumberOfPages; i++)
                {
                    token.ThrowIfCancellationRequested();

                    //get current page and all of its annotations
                    var pageDict   = reader.GetPageN(i);
                    var annotArray = (PdfArray)PdfReader.GetPdfObject(pageDict.Get(PdfName.ANNOTS));

                    //ensure array isn't empty
                    if (annotArray == null || annotArray.Length <= 0)
                    {
                        continue;
                    }

                    // check every annotation on the page
                    foreach (var annot in annotArray.ArrayList)
                    {
                        token.ThrowIfCancellationRequested();

                        //resolve the annotation to its dictionary
                        var annotDict = (PdfDictionary)PdfReader.GetPdfObject(annot);
                        if (annotDict == null)
                        {
                            continue;
                        }

                        //get the annotation subtype and ensure it is a link
                        var subtypeObj = annotDict.Get(PdfName.SUBTYPE);
                        if (subtypeObj == null)
                        {
                            continue;
                        }

                        var subtype = subtypeObj.ToString();
                        Log("Subtype: " + subtype);
                        if (subtype != "/Link")
                        {
                            continue;
                        }

                        //get the annotation's ACTION
                        var linkDict = (PdfDictionary)annotDict.GetDirectObject(PdfName.A);
                        if (linkDict == null)
                        {
                            continue;
                        }

                        // Internal GOTO links and actions without a URI have nothing to check. The URI is
                        // read fresh for every annotation so a previous link's URI can never be reused.
                        var uriObj = linkDict.Get(PdfName.URI);
                        if (uriObj == null || PdfName.GOTO.Equals(linkDict.Get(PdfName.S)))
                        {
                            continue;
                        }

                        string sUri = uriObj.ToString();
                        Log("URI: " + sUri);
                        if (String.IsNullOrEmpty(sUri))
                        {
                            continue;
                        }

                        //read the text under the annotation rectangle (the anchor text for the link)
                        var linkLocation = annotDict.GetAsArray(PdfName.RECT);
                        if (linkLocation == null || linkLocation.Size < 4)
                        {
                            continue;
                        }

                        var rect = new iTextSharp.text.Rectangle(
                            ((PdfNumber)linkLocation[0]).FloatValue,
                            ((PdfNumber)linkLocation[1]).FloatValue,
                            ((PdfNumber)linkLocation[2]).FloatValue,
                            ((PdfNumber)linkLocation[3]).FloatValue);
                        var renderFilter = new RenderFilter[] { new RegionTextRenderFilter(rect) };
                        ITextExtractionStrategy textExtractionStrategy = new FilteredTextRenderListener(new LocationTextExtractionStrategy(), renderFilter);
                        string anchorText = PdfTextExtractor.GetTextFromPage(reader, i, textExtractionStrategy).Trim();
                        Log(anchorText);

                        // StartsWith is used instead of Substring so URIs shorter than the prefix cannot throw.
                        bool isHttps = sUri.StartsWith("https", StringComparison.Ordinal);
                        bool isWww   = sUri.StartsWith("www", StringComparison.Ordinal);

                        if (isHttps)
                        {
                            Log("Prefix: " + "https");
                        }
                        else if (isWww)
                        {
                            Log("Prefix: " + "www");
                        }

                        bool hasSkuLength = sUri.Length == 70;

                        if (hasSkuLength && isHttps)
                        {
                            CheckSkuLink(i, sUri, anchorText);
                        }
                        else if (!hasSkuLength && (isWww || isHttps))
                        {
                            // NOTE(review): a scheme-less "www..." URI is passed to WebRequest.Create
                            // unchanged, exactly as before - confirm that such URIs are accepted.
                            CheckPlainLink(i, sUri, anchorText);
                        }
                    }
                }
                MessageBox.Show("Scrape complete");
                throbber.Hide();
                skuGrid.Show();
            }
            catch (OperationCanceledException)
            {
                // Cancellation is an expected way to stop, so report it as such instead of as an error.
                Log("Cancellation token: " + token);
                MessageBox.Show("Scrape stopped");
                throbber.Hide();
                skuGrid.Show();
            }
            catch (Exception ex)
            {
                MessageBox.Show(ex.Message);
                Log("EXCEPTION ERROR: " + ex + " " + ex.HelpLink);
                throbber.Hide();
                skuGrid.Show();
            }
            finally
            {
                taskIsRunning = false;

                //update grid
                skuGrid.Update();

                //close PDF reader
                reader.Close();

                WriteLog(log);
            }
        }

        // Checks a 70-character https link: compares the SKU in the PDF URI, the SKU in the response URI and
        // the anchor text, then adds the result row to the grid.
        private void CheckSkuLink(int page, string sUri, string anchorText)
        {
            using (WebResponse response = WebRequest.Create(sUri).GetResponse())
            {
                HttpWebResponse httpResponse = (HttpWebResponse)response;

                //website status from the request
                string webStatus = httpResponse.StatusDescription;
                Log("Webstatus: " + webStatus);

                //final URL after any redirects
                string responseURL = httpResponse.ResponseUri.ToString();
                Log("Response URI: " + responseURL);

                //sku number after the "=" in the response url
                string webSite = SegmentAfterEquals(responseURL);
                Log("Response SKU: " + webSite);

                //sku number after the "=" in the link harvested from the pdf
                string sku = SegmentAfterEquals(sUri);
                Log("PDF SKU: " + sku);

                string finalSku = TakePrefix(sku, 7);
                Log("PDF SKU Final: " + finalSku);

                //delete asterisks from the anchor text (the anchor text itself is kept, so it really takes
                //part in the comparison below), then truncate it to the sku length to drop extra symbols
                //captured in the rectangle
                string linkTextBuilder = anchorText.Replace("*", string.Empty);
                string linkText        = TakePrefix(linkTextBuilder, 7);
                Log("Link SKU Final: " + linkText);

                string hostIP = ProbeRemoteHostIp(sUri, "HTTP response: ");

                //default status of string match is "NO MATCH"; it becomes "MATCH" only when the PDF link sku,
                //the response sku and the anchor text all agree (an empty sku never matches)
                string match = "\tNO MATCH";
                if (sku.Length > 0 && linkText.Equals(sku) && webSite.Equals(sku))
                {
                    match = "MATCH";
                }

                //add data to datagridview
                this.skuGrid.Rows.Add(page, sku, linkTextBuilder, webSite, match, webStatus, hostIP);
            }
        }

        // Checks any other www/https link: compares the anchor text with the response URL and adds the
        // result row to the grid.
        private void CheckPlainLink(int page, string sUri, string anchorText)
        {
            using (WebResponse response = WebRequest.Create(sUri).GetResponse())
            {
                HttpWebResponse httpResponse = (HttpWebResponse)response;

                //website status from the request
                string webStatus = httpResponse.StatusDescription;
                Log("Web Status: " + webStatus);

                //final URL after any redirects
                string responseURL = httpResponse.ResponseUri.ToString();
                Log("Response URI: " + responseURL);

                string webSite = responseURL;
                Log("Response SKU: " + webSite);

                string hostIP = ProbeRemoteHostIp(sUri, "HTTP Response: ");

                //delete asterisks from the anchor text
                string linkTextBuilder = anchorText.Replace("*", string.Empty);

                //default status of string match is "NO MATCH"
                string match = "\tNO MATCH";
                if (linkTextBuilder.Equals(webSite))
                {
                    match = "MATCH";
                }

                //add data to datagridview
                this.skuGrid.Rows.Add(page, null, linkTextBuilder, webSite, match, webStatus, hostIP);
            }
        }

        // Issues a second request to the URI to capture the remote end point reported while connecting and
        // returns its address as text (empty when no end point was reported).
        private string ProbeRemoteHostIp(string uri, string responseLogLabel)
        {
            IPEndPoint remoteEP = null;

            HttpWebRequest req = (HttpWebRequest)HttpWebRequest.Create(uri);

            //remember the remote end point handed to the bind delegate
            req.ServicePoint.BindIPEndPointDelegate = delegate(ServicePoint servicePoint, IPEndPoint remoteEndPoint, int retryCount)
            {
                remoteEP = remoteEndPoint;
                return(null);
            };

            try
            {
                //a single response is requested, logged and released
                using (WebResponse probeResponse = req.GetResponse())
                {
                    Log(responseLogLabel + probeResponse);
                }
            }
            finally
            {
                //stop the request to make way for a new request on next iteration
                req.Abort();
            }

            string hostIP = remoteEP != null ? remoteEP.Address.ToString() : string.Empty;
            Log("Host IP: " + hostIP);

            return hostIP;
        }

        // Returns the text between the first and second '=' of the value, or an empty string when the value
        // contains no '='.
        private static string SegmentAfterEquals(string value)
        {
            string[] parts = value.Split('=');
            return parts.Length > 1 ? parts[1] : string.Empty;
        }

        // Returns at most the first 'length' characters of the value.
        private static string TakePrefix(string value, int length)
        {
            return value.Length <= length ? value : value.Substring(0, length);
        }