/// <summary>
/// Prints the text found inside a fixed region of every page of D:\191.pdf,
/// waiting for Enter after each page.
/// </summary>
/// <param name="args">Command-line arguments (unused).</param>
static void Main(string[] args)
{
    PdfReader reader = new PdfReader(@"D:\191.pdf");

    // Lazily yields the text inside the given region, one string per page.
    IEnumerable<string> GetColumnText(float llx, float lly, float urx, float ury)
    {
        // The region is identical for every page, so build it once.
        var rect = new iTextSharp.text.Rectangle(llx, lly, urx, ury);
        int pageTotal = reader.NumberOfPages;

        for (int pagecount = 1; pagecount <= pageTotal; pagecount++)
        {
            var renderFilter = new RenderFilter[] { new RegionTextRenderFilter(rect) };
            var textExtractionStrategy = new FilteredTextRenderListener(new LocationTextExtractionStrategy(), renderFilter);
            yield return PdfTextExtractor.GetTextFromPage(reader, pagecount, textExtractionStrategy);
        }
    }

    try
    {
        foreach (string result in GetColumnText(0, 0, 500, 500000))
        {
            Console.Write("{0} ", result);
            Console.ReadLine();
        }
    }
    finally
    {
        // Release the PDF even when extraction or console I/O throws.
        reader.Close();
    }
}
/// <summary>
/// Extracts the text inside a rectangular region of one page, splits it on '\n',
/// drops blank lines, trims the rest, echoes each kept line to the console and returns them.
/// </summary>
/// <param name="reader">Open reader for the PDF.</param>
/// <param name="pageNumber">Page to read.</param>
/// <param name="cordinate1">First rectangle constructor argument.</param>
/// <param name="coordinate2">Second rectangle constructor argument.</param>
/// <param name="coordinate3">Third rectangle constructor argument.</param>
/// <param name="coordinate4">Fourth rectangle constructor argument.</param>
/// <returns>The non-blank, trimmed lines found in the region.</returns>
public List<string> getTextByCoOrdinate(PdfReader reader, int pageNumber, int cordinate1, int coordinate2, int coordinate3, int coordinate4)
{
    var region = new iTextSharp.text.Rectangle(cordinate1, coordinate2, coordinate3, coordinate4);
    var filters = new RenderFilter[] { new RegionTextRenderFilter(region) };
    ITextExtractionStrategy listener = new FilteredTextRenderListener(new LocationTextExtractionStrategy(), filters);
    string pageText = PdfTextExtractor.GetTextFromPage(reader, pageNumber, listener);

    var data = new List<string>();
    foreach (string line in pageText.Split('\n'))
    {
        if (string.IsNullOrWhiteSpace(line))
        {
            continue;
        }

        data.Add(line.Trim());
    }

    // Echo the collected lines only after the whole page has been processed.
    for (int index = 0; index < data.Count; index++)
    {
        Console.WriteLine(data[index]);
    }

    return data;
}
/// <summary>
/// Returns the text located inside the given rectangle on one page of the PDF.
/// </summary>
/// <param name="reader">Open reader for the PDF.</param>
/// <param name="pageNumber">Page to read.</param>
/// <param name="llx">Lower-left x of the region.</param>
/// <param name="lly">Lower-left y of the region.</param>
/// <param name="urx">Upper-right x of the region.</param>
/// <param name="ury">Upper-right y of the region.</param>
/// <returns>The text extracted from the region.</returns>
public static string GetRectangle(PdfReader reader, int pageNumber, float llx, float lly, float urx, float ury)
{
    var region = new iTextSharp.text.Rectangle(llx, lly, urx, ury);
    var filters = new RenderFilter[] { new RegionTextRenderFilter(region) };
    ITextExtractionStrategy listener = new FilteredTextRenderListener(new LocationTextExtractionStrategy(), filters);

    string regionText = PdfTextExtractor.GetTextFromPage(reader, pageNumber, listener);
    return regionText;
}
/// <summary>
/// Extracts the text inside <paramref name="column"/> on the given page and hands it to
/// SplitColumnTextIntoRows to be broken into rows.
/// </summary>
/// <param name="column">Region of the page to read.</param>
/// <param name="reader">Open reader for the PDF.</param>
/// <param name="page">Page to read.</param>
/// <returns>Whatever SplitColumnTextIntoRows returns for the extracted text.</returns>
private string[] ExtractCurrentColumnFromPage(Rectangle column, PdfReader reader, int page)
{
    RenderFilter[] filters = { new RegionTextRenderFilter(column) };
    var listener = new FilteredTextRenderListener(new LocationTextExtractionStrategy(), filters);

    string columnText = PdfTextExtractor.GetTextFromPage(reader, page, listener);
    string[] rows = SplitColumnTextIntoRows(columnText);
    return rows;
}
/// <summary>
/// Opens the PDF at <paramref name="file"/> and returns the text found inside
/// <paramref name="rectangle"/> on page 1.
/// </summary>
/// <param name="file">Path of the PDF to open.</param>
/// <param name="rectangle">Region of page 1 to read.</param>
/// <returns>The text extracted from the region.</returns>
public string GetStringValueFromRegion(string file, iTextSharp.text.Rectangle rectangle)
{
    var reader = new PdfReader(file);
    try
    {
        var renderFilter = new RenderFilter[] { new RegionTextRenderFilter(rectangle) };
        ITextExtractionStrategy textExtractionStrategy = new FilteredTextRenderListener(new LocationTextExtractionStrategy(), renderFilter);
        return PdfTextExtractor.GetTextFromPage(reader, 1, textExtractionStrategy);
    }
    finally
    {
        // The reader is created here, so it must be released here; otherwise the file stays open.
        reader.Close();
    }
}
/// <summary>
/// Flips <paramref name="toggle"/> in DrawFilter: if the two currently overlap, the toggle's bits
/// are cleared; otherwise they are set.
/// </summary>
/// <param name="toggle">Filter bits to switch.</param>
public void ToggleDrawFilter(RenderFilter toggle)
{
    bool currentlyOn = (DrawFilter & toggle) > 0;

    DrawFilter = currentlyOn
        ? DrawFilter & ~toggle
        : DrawFilter | toggle;
}
/// <summary>
/// Builds a text-extraction strategy restricted to a rectangular region.
/// </summary>
/// <param name="pixelDistanceFromLeft">First RectangleJ constructor argument.</param>
/// <param name="pixelDistanceFromBottom">Second RectangleJ constructor argument.</param>
/// <param name="pixelDistanceWidth">Third RectangleJ constructor argument.</param>
/// <param name="pixelDistanceHeight">Fourth RectangleJ constructor argument.</param>
/// <returns>A location-based strategy filtered to the region.</returns>
public static ITextExtractionStrategy MakeRectangle(float pixelDistanceFromLeft, float pixelDistanceFromBottom, float pixelDistanceWidth, float pixelDistanceHeight)
{
    var region = new System.util.RectangleJ(
        pixelDistanceFromLeft,
        pixelDistanceFromBottom,
        pixelDistanceWidth,
        pixelDistanceHeight);

    RenderFilter[] regionOnly = { new RegionTextRenderFilter(region) };
    return new FilteredTextRenderListener(new LocationTextExtractionStrategy(), regionOnly);
}
/// <summary>
/// Returns the text located inside the given rectangle on one page.
/// </summary>
/// <param name="reader">Open reader for the PDF.</param>
/// <param name="pageNum">Page to read.</param>
/// <param name="llx">Lower-left x of the region.</param>
/// <param name="lly">Lower-left y of the region.</param>
/// <param name="urx">Upper-right x of the region.</param>
/// <param name="ury">Upper-right y of the region.</param>
/// <returns>The text extracted from the region.</returns>
private static string GetColumnText(PdfReader reader, int pageNum, float llx, float lly, float urx, float ury)
{
    // Reminder: parameters are in points, and 1 in = 2.54 cm = 72 points.
    var region = new iTextSharp.text.Rectangle(llx, lly, urx, ury);
    RenderFilter[] regionOnly = { new RegionTextRenderFilter(region) };
    var listener = new FilteredTextRenderListener(new LocationTextExtractionStrategy(), regionOnly);

    return PdfTextExtractor.GetTextFromPage(reader, pageNum, listener);
}
/// <summary>
/// Reads two regions of page 1 — region A from the coordinate*a members and region B from the
/// coordinate*b members — and stores the extracted text in textA and textB.
/// </summary>
private void ReadText()
{
    var regionA = new iTextSharp.text.Rectangle(coordinate1a, coordinate2a, coordinate3a, coordinate4a);
    var regionB = new iTextSharp.text.Rectangle(coordinate1b, coordinate2b, coordinate3b, coordinate4b);

    RenderFilter filterA = new RegionTextRenderFilter(regionA);
    RenderFilter filterB = new RegionTextRenderFilter(regionB);

    // Each region gets its own listener so the two extractions do not share state.
    ITextExtractionStrategy listenerA = new FilteredTextRenderListener(new LocationTextExtractionStrategy(), filterA);
    ITextExtractionStrategy listenerB = new FilteredTextRenderListener(new LocationTextExtractionStrategy(), filterB);

    textA = PdfTextExtractor.GetTextFromPage(reader, 1, listenerA);
    textB = PdfTextExtractor.GetTextFromPage(reader, 1, listenerB);
}
/// <summary>
/// Extracts text from a PDF file (only selectable text content from the pages, not OCR from images).
/// </summary>
/// <param name="filepath">Input file path.</param>
/// <param name="zone">
/// Rectangle, in millimetres measured from the top-left of the page, that limits where text is
/// extracted from. If null, the full page is processed.
/// </param>
/// <param name="pages">
/// Page numbers to extract. If null, or if the first item is 0, all pages are extracted.
/// An empty list yields an empty result. Page numbers outside 1..NumberOfPages are skipped.
/// </param>
/// <returns>A list of strings, one per extracted page, with '\n' replaced by Environment.NewLine.</returns>
public static List<string> GetPdfTextFromPages(string filepath, RectangleF? zone = null, List<int> pages = null)
{
    using (PdfReader reader = new PdfReader(filepath))
    {
        int pageCount = reader.NumberOfPages;

        // Check Count before reading the first item: First() on an empty list would throw.
        if (pages == null || (pages.Count > 0 && pages[0] == 0))
        {
            pages = Enumerable.Range(1, pageCount).ToList();
        }

        // The zone is the same for every page, so convert it to points only once.
        float x = 0f, y = 0f, w = 0f, h = 0f;
        if (zone.HasValue)
        {
            x = Utilities.MillimetersToPoints(zone.Value.X);
            y = Utilities.MillimetersToPoints(zone.Value.Y);
            w = Utilities.MillimetersToPoints(zone.Value.Width);
            h = Utilities.MillimetersToPoints(zone.Value.Height);
        }

        List<string> result = new List<string>(pages.Count);
        foreach (int pageNumber in pages)
        {
            // Skip page numbers the document does not have, in either direction.
            if (pageNumber < 1 || pageNumber > pageCount)
            {
                continue;
            }

            string text;
            if (zone.HasValue)
            {
                // Translate the top-left based zone into iText's bottom-left based coordinates.
                var pagesize = reader.GetPageSizeWithRotation(pageNumber);
                iTextSharp.text.Rectangle rect = new iTextSharp.text.Rectangle(x, pagesize.Top - y, x + w, pagesize.Top - y - h);
                RenderFilter[] renderFilter = { new RegionTextRenderFilter(rect) };
                ITextExtractionStrategy textExtractionStrategy = new FilteredTextRenderListener(new LocationTextExtractionStrategy(), renderFilter);
                text = PdfTextExtractor.GetTextFromPage(reader, pageNumber, textExtractionStrategy);
            }
            else
            {
                text = PdfTextExtractor.GetTextFromPage(reader, pageNumber);
            }

            result.Add(text.Replace("\n", Environment.NewLine));
        }

        // The using block releases the reader; no explicit Close() is needed.
        return result;
    }
}
/// <summary>
/// Marks scene entries as culled. An entry that is not already culled becomes culled when its
/// render filter shares no bits with <paramref name="filter"/>, or when a display group is
/// supplied that is not always visible and the entry's own draw group is not in it.
/// </summary>
/// <param name="filter">Render filter bits an entry must overlap to stay visible.</param>
/// <param name="dispGroup">Display group to test against; null disables the group test.</param>
public void ProcessSceneVisibility(RenderFilter filter, DrawGroup dispGroup)
{
    // The group test only applies when a group is given and it is not flagged always-visible.
    bool groupTestApplies = dispGroup != null && !dispGroup.AlwaysVisible;

    for (int slot = 0; slot < SYSTEM_SIZE; slot++)
    {
        if (cCulled[slot])
        {
            continue;
        }

        bool filteredOut = (cSceneVis[slot]._renderFilter & filter) == 0;
        bool outsideGroup = !filteredOut
            && groupTestApplies
            && cSceneVis[slot]._drawGroup != null
            && !cSceneVis[slot]._drawGroup.IsInDisplayGroup(dispGroup);

        if (filteredOut || outsideGroup)
        {
            cCulled[slot] = true;
        }
    }
}
/// <summary>
/// Opens the PDF at <paramref name="filepath"/> and returns the text inside a rectangular region
/// of one page.
/// </summary>
/// <param name="filepath">Path of the PDF to open.</param>
/// <param name="pageno">Page to read.</param>
/// <param name="cordinate1">First rectangle constructor argument (x).</param>
/// <param name="coordinate2">Second rectangle constructor argument (y); replaced by 1000 - value when <paramref name="filter"/> is false.</param>
/// <param name="coordinate3">Third rectangle constructor argument (x).</param>
/// <param name="coordinate4">Fourth rectangle constructor argument (y); replaced by 1000 - value when <paramref name="filter"/> is false.</param>
/// <param name="filter">True to use the y values as given; false to subtract them from 1000 first.</param>
/// <returns>The text extracted from the region.</returns>
public string getParagraphByCoOrdinate(string filepath, int pageno, int cordinate1, int coordinate2, int coordinate3, int coordinate4, bool filter)
{
    // The two original branches differed only in how the y values were derived.
    // NOTE(review): 1000 is presumably an assumed page height — confirm against real page sizes.
    int firstY = filter ? coordinate2 : 1000 - coordinate2;
    int secondY = filter ? coordinate4 : 1000 - coordinate4;

    PdfReader reader = new PdfReader(filepath);
    try
    {
        iTextSharp.text.Rectangle rect = new iTextSharp.text.Rectangle(cordinate1, firstY, coordinate3, secondY);
        RenderFilter[] renderFilter = { new RegionTextRenderFilter(rect) };
        ITextExtractionStrategy textExtractionStrategy = new FilteredTextRenderListener(new LocationTextExtractionStrategy(), renderFilter);
        return PdfTextExtractor.GetTextFromPage(reader, pageno, textExtractionStrategy);
    }
    finally
    {
        // The reader is opened here, so release it here; otherwise the file stays open.
        reader.Close();
    }
}
// Disabled overload, kept for reference:
//static public Dictionary<string, string> ExtractTexts(this PdfReader pr, int pageI, Dictionary<string, RenderFilter[]> fieldNames2filters)
//{
//    Dictionary<string, string> fieldNames2texts = new Dictionary<string, string>();
//    foreach (string fn in fieldNames2filters.Keys)
//        fieldNames2texts[fn] = pr.ExtractText(pageI, fieldNames2filters[fn]);
//    return fieldNames2texts;
//}

/// <summary>
/// Builds a single region filter from the given RectangleJ arguments and forwards to the
/// ExtractText overload that takes a filter array.
/// </summary>
/// <param name="pr">Reader for the PDF.</param>
/// <param name="pageI">Page to read.</param>
/// <param name="x">First RectangleJ constructor argument.</param>
/// <param name="y">Second RectangleJ constructor argument.</param>
/// <param name="w">Third RectangleJ constructor argument.</param>
/// <param name="h">Fourth RectangleJ constructor argument.</param>
/// <returns>The result of the filter-array overload.</returns>
public static string ExtractText(this PdfReader pr, int pageI, float x, float y, float w, float h)
{
    var region = new System.util.RectangleJ(x, y, w, h);
    RenderFilter[] regionOnly = { new RegionTextRenderFilter(region) };
    return ExtractText(pr, pageI, regionOnly);
}
/// <summary>
/// Dumps the text of every page of a PDF to <paramref name="outFileName"/> and writes one
/// semicolon-separated line per page to a companion file whose name is prefixed with "filtered ".
/// Each companion line holds: the "Matricule : … SS : …" identifiers found by regex, an empty
/// field, the text read from the identifier region, the "Net imposable" value found by regex,
/// and the text read from the net-amount region.
/// </summary>
/// <param name="fileName">Path of the PDF to read.</param>
/// <param name="outFileName">Path of the full-text output file (UTF-8, overwritten).</param>
public static void ExtractText(string fileName, string outFileName)
{
    /* To extract only the first 3 pages:
     * PdfReader pdfReader1 = new PdfReader("AN_PAI_12022018_1M.pdf");
     * Document document = new Document();
     *
     * PdfCopy copy = new PdfCopy(document, new FileStream("splitpaie1-3.pdf", FileMode.Create));
     * document.Open();
     * for (int page = 1; page <= 3; page++)
     * {
     *     document.NewPage();
     *     copy.AddPage(copy.GetImportedPage(pdfReader1, page));
     * }
     * document.Close();*/

    // Prefix only the file-name part, so an outFileName that includes a directory still yields
    // a valid path. For a bare file name this is exactly "filtered " + outFileName.
    string filteredFileName = Path.Combine(
        Path.GetDirectoryName(outFileName) ?? string.Empty,
        "filtered " + Path.GetFileName(outFileName));

    using (StreamWriter outFile = new StreamWriter(outFileName, false, System.Text.Encoding.UTF8))
    using (StreamWriter outFile2 = new StreamWriter(filteredFileName, false, System.Text.Encoding.UTF8))
    {
        PdfReader pdfReader = new PdfReader(fileName);
        try
        {
            for (int page = 1; page <= pdfReader.NumberOfPages; page++)
            {
                // Full page text goes verbatim to the first file.
                ITextExtractionStrategy strategy = new SimpleTextExtractionStrategy();
                string currentPageText = PdfTextExtractor.GetTextFromPage(pdfReader, page, strategy);
                outFile.Write(currentPageText);

                // Identifiers located by regex in the page text.
                var match = Regex.Match(currentPageText, "Matricule : *([0-9]*) SS : ([0-9]*)");
                if (match.Success)
                {
                    string id1 = match.Groups[1].Value;
                    string id2 = match.Groups[2].Value;
                    outFile2.Write("Matricule : " + id1 + " SS : " + id2);
                }
                outFile2.Write(";");

                // Same identifiers located by position on the page.
                // NOTE(review): 848 - 110 next to 842 - 128 looks like a typo for 842 — confirm.
                iTextSharp.text.Rectangle rect = new iTextSharp.text.Rectangle(400, 842 - 128, 480, 848 - 110);
                RenderFilter[] renderFilter = { new RegionTextRenderFilter(rect) };
                ITextExtractionStrategy textExtractionStrategy = new FilteredTextRenderListener(new LocationTextExtractionStrategy(), renderFilter);
                string idLocation = PdfTextExtractor.GetTextFromPage(pdfReader, page, textExtractionStrategy);
                outFile2.Write(";");
                outFile2.Write(idLocation);
                outFile2.Write(";");

                // Net taxable amount located by regex.
                //match = Regex.Match(currentPageText, "Pér *([0-9]*) *([0-9]*) *([0-9]*)");
                match = Regex.Match(currentPageText, "Net imposable(.*)Net imposable");
                if (match.Success)
                {
                    outFile2.Write(match.Groups[1].Value);
                }

                // Net taxable amount located by position on the page. The regex and positional
                // values are expected to differ on some pages (e.g. page 259: back-pay for a
                // prior period, no net taxable amount for the period), and the positional value
                // may be empty (e.g. page 56: a payslip spanning several pages).
                rect = new iTextSharp.text.Rectangle(150, 240, 210, 250);
                renderFilter = new RenderFilter[] { new RegionTextRenderFilter(rect) };
                textExtractionStrategy = new FilteredTextRenderListener(new LocationTextExtractionStrategy(), renderFilter);
                string netLocation = PdfTextExtractor.GetTextFromPage(pdfReader, page, textExtractionStrategy);
                outFile2.Write(";");
                outFile2.Write(netLocation);

                outFile2.WriteLine();
            }
        }
        finally
        {
            // Release the PDF even when extraction or writing throws.
            pdfReader.Close();
        }
    }
}
/// <summary>
/// Walks every link annotation of the PDF at <paramref name="path"/>, reads the anchor text under
/// each link rectangle, requests the link's URI, and adds one row per checked link to skuGrid.
/// Links whose URI is exactly 70 characters long and starts with "https" are treated as SKU links
/// (the part after '=' is compared); other "https"/"www" links are compared as whole URLs.
/// Failures are reported through a message box and the log rather than thrown to the caller.
/// </summary>
/// <param name="token">Cancellation token, checked after each processed link.</param>
/// <param name="path">Path of the PDF to scan.</param>
private void Scrape(CancellationToken token, string path)
{
    PdfReader reader = new PdfReader(path);
    try
    {
        for (var i = 1; i <= reader.NumberOfPages; i++)
        {
            // All annotations of the current page; pages without any are skipped.
            var pageDict = reader.GetPageN(i);
            var annotArray = (PdfArray)PdfReader.GetPdfObject(pageDict.Get(PdfName.ANNOTS));
            if (annotArray == null)
            {
                continue;
            }

            foreach (var annot in annotArray.ArrayList)
            {
                var annotDict = (PdfDictionary)PdfReader.GetPdfObject(annot);
                if (annotDict == null)
                {
                    continue;
                }

                // Only link annotations are of interest; one without a subtype cannot be a link.
                var subtypeObject = annotDict.Get(PdfName.SUBTYPE);
                if (subtypeObject == null)
                {
                    continue;
                }
                var subtype = subtypeObject.ToString();
                Log("Subtype: " + subtype);
                if (subtype != "/Link")
                {
                    continue;
                }

                // The link's action. Internal GoTo jumps and actions without a URI carry nothing
                // to request, so skip them instead of reusing the previous link's URI.
                var linkDict = (PdfDictionary)annotDict.GetDirectObject(PdfName.A);
                if (linkDict == null)
                {
                    continue;
                }
                if (PdfName.GOTO.Equals(linkDict.Get(PdfName.S)))
                {
                    continue;
                }
                var uriObject = linkDict.Get(PdfName.URI);
                if (uriObject == null)
                {
                    continue;
                }
                string sUri = uriObject.ToString();
                Log("URI: " + sUri);
                if (String.IsNullOrEmpty(sUri))
                {
                    continue;
                }

                // Read the text under the link rectangle (the anchor text of the link).
                var linkLocation = annotDict.GetAsArray(PdfName.RECT);
                iTextSharp.text.Rectangle rect = new iTextSharp.text.Rectangle(
                    ((PdfNumber)linkLocation[0]).FloatValue,
                    ((PdfNumber)linkLocation[1]).FloatValue,
                    ((PdfNumber)linkLocation[2]).FloatValue,
                    ((PdfNumber)linkLocation[3]).FloatValue);
                RenderFilter[] renderFilter = { new RegionTextRenderFilter(rect) };
                ITextExtractionStrategy textExtractionStrategy = new FilteredTextRenderListener(new LocationTextExtractionStrategy(), renderFilter);
                string linkTextBuilder = PdfTextExtractor.GetTextFromPage(reader, i, textExtractionStrategy).Trim();
                Log(linkTextBuilder);

                // StartsWith is safe for URIs shorter than the prefix, unlike Substring.
                bool isHttps = sUri.StartsWith("https", StringComparison.Ordinal);
                bool isWww = sUri.StartsWith("www", StringComparison.Ordinal);
                if (isHttps)
                {
                    Log("Prefix: https");
                }
                else if (isWww)
                {
                    Log("Prefix: www");
                }

                bool isSkuLink = sUri.Length == 70 && isHttps;
                bool isPlainLink = sUri.Length != 70 && (isWww || isHttps);
                if (!isSkuLink && !isPlainLink)
                {
                    continue;
                }

                // Request the link. This response stays open until the row has been added, so the
                // endpoint probe below has to open its own connection.
                WebResponse response = WebRequest.Create(sUri).GetResponse();
                try
                {
                    var httpResponse = (HttpWebResponse)response;
                    string webStatus = httpResponse.StatusDescription.ToString();
                    Log((isSkuLink ? "Webstatus: " : "Web Status: ") + webStatus);
                    string responseURL = httpResponse.ResponseUri.ToString();
                    Log("Response URI: " + responseURL);

                    // Default status of the comparison is "NO MATCH".
                    string match = "\tNO MATCH";

                    if (isSkuLink)
                    {
                        // SKU = the part after the first '=' in the response URL and in the PDF link.
                        // NOTE(review): both Split('=')[1] calls and the Substring(0, 7) calls
                        // throw when the input is shorter than expected — confirm inputs.
                        string webSite = responseURL.Split('=')[1];
                        Log("Response SKU: " + webSite);
                        string sku = sUri.Split('=')[1];
                        Log("PDF SKU: " + sku);
                        string finalSku = sku.Substring(0, 7);
                        Log("PDF SKU Final: " + finalSku);

                        // NOTE(review): this replaces the anchor text with the URL-derived SKU, so
                        // the anchor text read from the page is never compared in this branch —
                        // confirm whether linkTextBuilder.Replace(...) was intended.
                        linkTextBuilder = finalSku.Replace("*", string.Empty);
                        string linkText = linkTextBuilder.Substring(0, 7);
                        Log("Link SKU Final: " + linkText);

                        string hostIP = ProbeRemoteAddress(sUri, "HTTP response: ");

                        if (linkText.Equals(sku) && webSite.Equals(sku))
                        {
                            match = "MATCH";
                        }

                        this.skuGrid.Rows.Add(i, sku, linkTextBuilder, webSite, match, webStatus, hostIP);
                    }
                    else
                    {
                        string webSite = responseURL;
                        Log("Response SKU: " + webSite);

                        string hostIP = ProbeRemoteAddress(sUri, "HTTP Response: ");

                        // Asterisks in the anchor text are not part of the link.
                        linkTextBuilder = linkTextBuilder.Replace("*", string.Empty);

                        if (linkTextBuilder.Equals(webSite))
                        {
                            match = "MATCH";
                        }

                        this.skuGrid.Rows.Add(i, null, linkTextBuilder, webSite, match, webStatus, hostIP);
                    }
                }
                finally
                {
                    // Close the response even when parsing or the probe throws.
                    response.Close();
                }

                // Leave the loops on cancellation; the dedicated catch below reports it.
                token.ThrowIfCancellationRequested();
            }
        }

        MessageBox.Show("Scrape complete");
        throbber.Hide();
        skuGrid.Show();
    }
    catch (OperationCanceledException)
    {
        Log("Cancellation token: " + token);
        MessageBox.Show("Scrape stopped");
        throbber.Hide();
        skuGrid.Show();
    }
    catch (Exception ex)
    {
        MessageBox.Show(ex.Message);
        Log("EXCEPTION ERROR: " + ex + " " + ex.HelpLink);
        throbber.Hide();
        skuGrid.Show();
    }
    finally
    {
        taskIsRunning = false;
        skuGrid.Update();
        reader.Close();
        WriteLog(log);
    }
}

// Issues a request to the URI only to learn which remote endpoint it connects to; returns that
// endpoint's IP address as text, or an empty string when no endpoint was reported.
private string ProbeRemoteAddress(string uri, string responseLogPrefix)
{
    IPEndPoint remoteEP = null;
    HttpWebRequest req = (HttpWebRequest)WebRequest.Create(uri);

    // The bind delegate receives the remote endpoint; returning null leaves the local binding
    // to the system.
    req.ServicePoint.BindIPEndPointDelegate = delegate(ServicePoint servicePoint, IPEndPoint remoteEndPoint, int retryCount)
    {
        remoteEP = remoteEndPoint;
        return null;
    };

    try
    {
        WebResponse probeResponse = req.GetResponse();
        try
        {
            Log(responseLogPrefix + probeResponse);
        }
        finally
        {
            probeResponse.Close();
        }

        string hostIP = remoteEP != null ? remoteEP.Address.ToString() : string.Empty;
        Log("Host IP: " + hostIP);
        return hostIP;
    }
    finally
    {
        // Stop the request to make way for a new one on the next link.
        req.Abort();
    }
}