/// <summary>
/// Extracts the text found inside a rectangular region of one page of the PDF at <c>filepath</c>.
/// </summary>
/// <param name="page">Page number handed to <c>PdfTextExtractor.GetTextFromPage</c>.</param>
/// <param name="area">Region to read; every coordinate is multiplied by 72 before filtering
/// (presumably the input is in inches - confirm with callers).</param>
/// <param name="landscape">When true the region is remapped before scaling:
/// X becomes 8.23 - Y - Height, Y becomes X, and Width/Height are swapped.</param>
/// <returns>The extracted text followed by a line terminator.</returns>
string GetTextByLocation(int page, RectangleJ area, bool landscape)
{
    const float pointsPerUnit = 72.0f;
    const float landscapePageHeight = 8.23f;

    // Work on a copy so the caller's rectangle is never modified.
    RectangleJ region;
    if (landscape)
    {
        region = new RectangleJ(landscapePageHeight - area.Y - area.Height, area.X, area.Height, area.Width);
    }
    else
    {
        region = new RectangleJ(area.X, area.Y, area.Width, area.Height);
    }

    region.X *= pointsPerUnit;
    region.Y *= pointsPerUnit;
    region.Width *= pointsPerUnit;
    region.Height *= pointsPerUnit;

    RenderFilter[] regionFilter = { new RegionTextRenderFilter(region) };
    StringBuilder extracted = new StringBuilder();

    using (PdfReader pdf = new PdfReader(filepath))
    {
        ITextExtractionStrategy filtered = new FilteredTextRenderListener(new LocationTextExtractionStrategy(), regionFilter);
        extracted.AppendLine(PdfTextExtractor.GetTextFromPage(pdf, page, filtered));
    }

    return extracted.ToString();
}
/// <summary>
/// Accepts a text chunk only when both its start and its end location lie inside <c>Rect</c>.
/// </summary>
/// <param name="textChunk">Chunk whose start/end locations are tested (components 0 and 1 are used).</param>
/// <returns>True when both end points are contained in the rectangle; otherwise false.</returns>
public bool Accept(LocationTextExtractionStrategy.TextChunk textChunk)
{
    var region = new RectangleJ(Rect);

    // Reject early when the chunk starts outside the region; the end point is then not examined.
    if (!region.Contains(textChunk.StartLocation[0], textChunk.StartLocation[1]))
    {
        return false;
    }

    return region.Contains(textChunk.EndLocation[0], textChunk.EndLocation[1]);
}
/// <summary>
/// Reads a table region of one page and returns the text of each cell, row by row.
/// </summary>
/// <param name="page">Page number to read.</param>
/// <param name="area">Region covering the whole table.</param>
/// <param name="rowHeight">Height of one table row.</param>
/// <param name="rowHeightOffset">Extra height forwarded to <c>GetRowArea</c>.</param>
/// <param name="columnWidth">Width of every column, in order.</param>
/// <param name="landscape">Forwarded to <c>GetColumns</c> for every row.</param>
/// <returns>One string array per row, in the order <c>GetRowArea</c> produced the rows.</returns>
public List<string[]> GetTableCells(int page, RectangleJ area, float rowHeight, float rowHeightOffset, float[] columnWidth, bool landscape)
{
    List<RectangleJ> rowAreas = GetRowArea(page, area, rowHeight, rowHeightOffset);
    List<string[]> cells = new List<string[]>();

    for (int index = 0; index < rowAreas.Count; index++)
    {
        cells.Add(GetColumns(page, rowAreas[index], columnWidth, landscape));
    }

    return cells;
}
/// <summary>
/// Tests whether the text found at a given location on page 1 matches a regular expression.
/// </summary>
/// <param name="stamp">Regular-expression pattern to look for.</param>
/// <param name="stampLocation">Region of page 1 whose text is examined.</param>
/// <param name="landscape">Forwarded to <c>GetTextByLocation</c>.</param>
/// <returns>True when the pattern matches the extracted text.</returns>
bool CheckCompanyName(string stamp, RectangleJ stampLocation, bool landscape)
{
    // The pattern is compiled before the page is read, so an invalid pattern fails first.
    Regex stampPattern = new Regex(stamp);
    string stampText = GetTextByLocation(1, stampLocation, landscape);
    return stampPattern.IsMatch(stampText);
}
/**
 * Computes the overlap of two rectangles.
 * @param rect1 first rectangle
 * @param rect2 second rectangle
 * @return null if the intersection is empty, a Rectangle representing the intersection otherwise
 */
private Rectangle Intersection(Rectangle rect1, Rectangle rect2)
{
    RectangleJ overlap = new RectangleJ(rect1).Intersection(new RectangleJ(rect2));

    if (overlap.IsEmpty())
    {
        return null;
    }

    return new Rectangle(overlap);
}
/**
 * Builds the rectangle spanned by the larger of the two origins and the
 * smaller of the two far corners.
 * The result may have a negative width or height; nothing is clamped here.
 * @param r the rectangle to intersect with this one
 * @return a new RectangleJ; this instance and r are left untouched
 */
public RectangleJ Intersection(RectangleJ r)
{
    float originX = Math.Max(x, r.x);
    float originY = Math.Max(y, r.y);
    float farX = Math.Min(x + width, r.x + r.width);
    float farY = Math.Min(y + height, r.y + r.height);

    float overlapWidth = farX - originX;
    float overlapHeight = farY - originY;
    return new RectangleJ(originX, originY, overlapWidth, overlapHeight);
}
/**
 * Grows this rectangle in place so that it also covers the given rectangle.
 * Both rectangles are handled with either sign of width/height, because the
 * smaller and larger edge are determined per axis before combining.
 * @param rect the rectangle to include
 */
public void Add(RectangleJ rect)
{
    float ownMinX = Math.Min(x, x + width);
    float ownMaxX = Math.Max(x, x + width);
    float otherMinX = Math.Min(rect.x, rect.x + rect.width);
    float otherMaxX = Math.Max(rect.x, rect.x + rect.width);

    float ownMinY = Math.Min(y, y + height);
    float ownMaxY = Math.Max(y, y + height);
    float otherMinY = Math.Min(rect.y, rect.y + rect.height);
    float otherMaxY = Math.Max(rect.y, rect.y + rect.height);

    float left = Math.Min(ownMinX, otherMinX);
    float right = Math.Max(ownMaxX, otherMaxX);
    float low = Math.Min(ownMinY, otherMinY);
    float high = Math.Max(ownMaxY, otherMaxY);

    x = left;
    y = low;
    width = right - left;
    height = high - low;
}
/**
 * Checks whether a rectangle contains every one of the given points.
 * @param rect the rectangle to test against
 * @param points the points to check
 * @return false as soon as one point is outside; true otherwise (also for no points)
 */
private static bool ContainsAll(RectangleJ rect, params Point2D[] points)
{
    for (int index = 0; index < points.Length; index++)
    {
        if (!rect.Contains(points[index]))
        {
            return false;
        }
    }

    return true;
}
/**
 * Invoked for every text chunk that is encountered.
 * Accumulates the bounding rectangles of the chunk's descent line and ascent
 * line into textRectangle; the first chunk's descent rectangle becomes the
 * initial textRectangle.
 * @param renderInfo render information of the text chunk, used to obtain coordinates
 */
public virtual void RenderText(TextRenderInfo renderInfo)
{
    var descentBounds = renderInfo.GetDescentLine().GetBoundingRectange();

    if (textRectangle != null)
    {
        textRectangle.Add(descentBounds);
    }
    else
    {
        textRectangle = descentBounds;
    }

    textRectangle.Add(renderInfo.GetAscentLine().GetBoundingRectange());
}
/// <summary>
/// Splits an area into row rectangles of a fixed height.
/// The first rectangle starts at <c>area.Y + area.Height - rowHeight</c> and each following one
/// is <paramref name="rowHeight"/> lower, for as long as the row's Y is not below <c>area.Y</c>.
/// </summary>
/// <param name="page">Not used by this method.</param>
/// <param name="area">Area to split into rows.</param>
/// <param name="rowHeight">Distance between consecutive rows; must be greater than zero.</param>
/// <param name="rowHeightOffset">Each row rectangle gets a height of <c>rowHeight + 2 * rowHeightOffset</c>.</param>
/// <returns>The row rectangles, ordered from the highest Y to the lowest.</returns>
/// <exception cref="System.ArgumentOutOfRangeException">
/// <paramref name="rowHeight"/> is zero or negative (the loop would never terminate).
/// </exception>
List<RectangleJ> GetRowArea(int page, RectangleJ area, float rowHeight, float rowHeightOffset)
{
    // A non-positive step never moves y below area.Y, so the loop would run (and allocate) forever.
    if (rowHeight <= 0)
    {
        throw new System.ArgumentOutOfRangeException("rowHeight", "Row height must be greater than zero.");
    }

    List<RectangleJ> result = new List<RectangleJ>();
    float x = area.X;
    float paddedHeight = rowHeight + 2 * rowHeightOffset;

    for (float y = area.Y + area.Height - rowHeight; y >= area.Y; y -= rowHeight)
    {
        result.Add(new RectangleJ(x, y, area.Width, paddedHeight));
    }

    return result;
}
/// <summary>
/// Reads the text of every column of one table row.
/// Columns are laid out left to right starting at <c>row.X</c>; each column is as wide as the
/// matching entry of <paramref name="columnWidth"/> and as tall as the row.
/// </summary>
/// <param name="page">Page number forwarded to <c>GetTextByLocation</c>.</param>
/// <param name="row">Area of the row to read.</param>
/// <param name="columnWidth">Width of every column, in order. May be empty.</param>
/// <param name="landscape">Forwarded to <c>GetTextByLocation</c>.</param>
/// <returns>The trimmed text of each column; an empty array when no widths are given.</returns>
string[] GetColumns(int page, RectangleJ row, float[] columnWidth, bool landscape)
{
    string[] result = new string[columnWidth.Length];

    // Track the running X instead of pre-building a rectangle from columnWidth[0],
    // which threw IndexOutOfRangeException for an empty width array.
    float columnX = row.X;
    for (int i = 0; i < columnWidth.Length; i++)
    {
        RectangleJ column = new RectangleJ(columnX, row.Y, columnWidth[i], row.Height);
        result[i] = GetTextByLocation(page, column, landscape).Trim();
        columnX += columnWidth[i];
    }

    return result;
}
/// <summary>
/// Reads a table from a PDF, cell by cell, across all of its pages.
/// </summary>
/// <param name="pdf">Path of the PDF.</param>
/// <param name="origemXPag1">X coordinate where reading starts on the first page.</param>
/// <param name="origemYPag1">Y coordinate where reading starts on the first page.</param>
/// <param name="linhasPag1">Number of rows on the first page.</param>
/// <param name="origemXOutrasPag">X coordinate where reading starts on the remaining pages.</param>
/// <param name="origemYOutrasPag">Y coordinate where reading starts on the remaining pages.</param>
/// <param name="linhasOutrasPag">Number of rows on the remaining pages.</param>
/// <param name="alturaLinha">Height of one row.</param>
/// <param name="colunas">Column names and widths, in reading order.</param>
/// <returns>One dictionary per row read, mapping each column name to the extracted text.</returns>
private List<Dictionary<string, string>> ReadTabelaPDF(string pdf, float origemXPag1, float origemYPag1, int linhasPag1, float origemXOutrasPag, float origemYOutrasPag, int linhasOutrasPag, float alturaLinha, Dictionary<string, float> colunas)
{
    var rows = new List<Dictionary<string, string>>();

    using (var reader = new iTextSharp.text.pdf.PdfReader(pdf))
    {
        for (int pageNumber = 1; pageNumber <= reader.NumberOfPages; pageNumber++)
        {
            // The first page has its own origin and row count; all other pages share one layout.
            bool isFirstPage = pageNumber == 1;
            float originX = isFirstPage ? origemXPag1 : origemXOutrasPag;
            float originY = isFirstPage ? origemYPag1 : origemYOutrasPag;
            int rowCount = isFirstPage ? linhasPag1 : linhasOutrasPag;

            for (int rowIndex = 0; rowIndex < rowCount; rowIndex++)
            {
                var cells = new Dictionary<string, string>();
                float offsetX = 0;

                foreach (KeyValuePair<string, float> column in colunas)
                {
                    RectangleJ cellArea = new RectangleJ(originX + offsetX, originY + (rowIndex * alturaLinha), column.Value, alturaLinha);
                    iTextSharp.text.pdf.parser.RenderFilter cellFilter = new iTextSharp.text.pdf.parser.RegionTextRenderFilter(cellArea);
                    iTextSharp.text.pdf.parser.ITextExtractionStrategy cellStrategy =
                        new iTextSharp.text.pdf.parser.FilteredTextRenderListener(
                            new iTextSharp.text.pdf.parser.LocationTextExtractionStrategy(), cellFilter);

                    string cellText = iTextSharp.text.pdf.parser.PdfTextExtractor.GetTextFromPage(reader, pageNumber, cellStrategy);
                    cells.Add(column.Key, cellText);

                    // The next column starts where this one ends.
                    offsetX += column.Value;
                }

                rows.Add(cells);
            }
        }
    }

    return rows;
}
/// <summary>
/// Looks on page 2 for a row whose first column matches "Detailed Cash Flow" and
/// returns the number found in that row's second column.
/// </summary>
/// <returns>The parsed second-column value of the first matching row, or -1 when no row matches.</returns>
int GetTransactionsPage()
{
    float[] columnWidth = { 6f, 2f };
    RectangleJ tableArea = new RectangleJ(1.09f, 0.45f, 8f, 6.09f);
    const float tableRowHeight = 0.31f;

    List<string[]> contents = GetTableCells(2, tableArea, tableRowHeight, columnWidth, true);

    for (int index = 0; index < contents.Count; index++)
    {
        string[] cells = contents[index];
        if (Regex.IsMatch(cells[0], "Detailed Cash Flow"))
        {
            return int.Parse(cells[1]);
        }
    }

    return -1;
}
/// <summary>
/// Extracts, page by page, the text of <c>Document</c> that falls inside the
/// rectangle (0, 0, 2000, 1800).
/// </summary>
/// <returns>The text of all pages, each page followed by a line terminator.</returns>
public string ReadFromPosistionIText()
{
    StringBuilder pagesText = new StringBuilder();
    RenderFilter[] regionFilter = { new RegionTextRenderFilter(new RectangleJ(0, 0, 2000, 1800)) };

    for (int pageNumber = 1; pageNumber <= Document.NumberOfPages; pageNumber++)
    {
        // A fresh listener per page; the region filter itself is shared.
        ITextExtractionStrategy pageStrategy = new FilteredTextRenderListener(new LocationTextExtractionStrategy(), regionFilter);
        string pageText = PdfTextExtractor.GetTextFromPage(Document, pageNumber, pageStrategy);
        pagesText.AppendLine(pageText);
    }

    return pagesText.ToString();
}
/// <summary>
/// Finds the 1-based position of the first row of the page's transactions area whose
/// first column (width 0.73) matches "MA_CASH".
/// </summary>
/// <param name="page">Page to inspect.</param>
/// <returns>The 1-based row position of the match, or 0 when no row matches.</returns>
int GetRowsToHeader(int page)
{
    // Only the first 0.73 units of the area's width are read.
    RectangleJ headerArea = GetTransactionsArea(page);
    headerArea.Width = 0.73f;

    float[] singleColumn = { 0.73f };
    List<string[]> rows = GetTableCells(page, headerArea, rowHeight, singleColumn, true);

    for (int index = 0; index < rows.Count; index++)
    {
        if (Regex.IsMatch(rows[index][0], "MA_CASH"))
        {
            return index + 1;
        }
    }

    return 0;
}
/// <summary>
/// Returns the text found inside a rectangular region of one PDF page.
/// </summary>
/// <param name="CaminhoArquivo">Path of the PDF file to read.</param>
/// <param name="Posicoes">Array with 4 numeric strings that are parsed with <c>float.Parse</c>
/// and passed, in order, as the four arguments of the <c>RectangleJ</c> constructor.</param>
/// <param name="Pagina">Number of the page that holds the data.</param>
/// <returns>The extracted text, or a single space when nothing was extracted.</returns>
private string RetornarValor(String CaminhoArquivo, string[] Posicoes, int Pagina)
{
    string retorno;

    // The using statement closes and disposes the reader exactly once; the earlier explicit
    // Close()/Dispose() calls inside the block made that happen twice.
    using (PdfReader pdfReader = new PdfReader(CaminhoArquivo))
    {
        RectangleJ rect = new RectangleJ(
            float.Parse(Posicoes[0]),
            float.Parse(Posicoes[1]),
            float.Parse(Posicoes[2]),
            float.Parse(Posicoes[3]));
        RenderFilter[] renderFilter = { new RegionTextRenderFilter(rect) };
        ITextExtractionStrategy textExtractionStrategy = new FilteredTextRenderListener(new LocationTextExtractionStrategy(), renderFilter);

        retorno = PdfTextExtractor.GetTextFromPage(pdfReader, Pagina, textExtractionStrategy);
    }

    return retorno == string.Empty ? " " : retorno;
}
/**
 * Constructs a filter from an iTextSharp rectangle.
 * The rectangle is converted to a new RectangleJ, which is stored as the filter region.
 * @param filterRect the rectangle to filter text against.
 */
public RegionTextRenderFilter(iTextSharp.text.Rectangle filterRect)
{
    RectangleJ region = new RectangleJ(filterRect);
    this.filterRect = region;
}
/**
 * Constructs a filter from a RectangleJ.
 * The given instance is stored as-is (no copy is made).
 * @param filterRect the rectangle to filter text against.
 */
public RegionTextRenderFilter(RectangleJ filterRect)
{
    this.filterRect = filterRect;
}
/// <summary>
/// Reads the cells of a table region, passing <c>false</c> for the <c>landscape</c>
/// argument of the six-parameter overload.
/// </summary>
/// <param name="page">Page number to read.</param>
/// <param name="area">Region covering the whole table.</param>
/// <param name="rowHeight">Height of one table row.</param>
/// <param name="rowHeightOffset">Extra row height forwarded to the full overload.</param>
/// <param name="columnWidth">Width of every column, in order.</param>
/// <returns>Whatever the six-parameter overload returns for these arguments.</returns>
public List<string[]> GetTableCells(int page, RectangleJ area, float rowHeight, float rowHeightOffset, float[] columnWidth)
{
    const bool landscape = false;
    return GetTableCells(page, area, rowHeight, rowHeightOffset, columnWidth, landscape);
}
/**
 * Returns the rectangle whose origin is the per-axis maximum of both origins and
 * whose far corner is the per-axis minimum of both far corners.
 * Width or height of the result can be negative; no clamping is applied.
 * @param r the rectangle to intersect with this one
 * @return a newly created RectangleJ
 */
public RectangleJ Intersection(RectangleJ r)
{
    float startX = Math.Max(x, r.x);
    float startY = Math.Max(y, r.y);
    float endX = Math.Min(x + width, r.x + r.width);
    float endY = Math.Min(y + height, r.y + r.height);

    return new RectangleJ(
        startX,
        startY,
        endX - startX,
        endY - startY);
}
/// <summary>
/// Reads the cells of a table region, passing <c>0f</c> for the <c>rowHeightOffset</c>
/// argument of the six-parameter overload.
/// </summary>
/// <param name="page">Page number to read.</param>
/// <param name="area">Region covering the whole table.</param>
/// <param name="rowHeight">Height of one table row.</param>
/// <param name="columnWidth">Width of every column, in order.</param>
/// <param name="landscape">Forwarded unchanged to the full overload.</param>
/// <returns>Whatever the six-parameter overload returns for these arguments.</returns>
public List<string[]> GetTableCells(int page, RectangleJ area, float rowHeight, float[] columnWidth, bool landscape)
{
    const float rowHeightOffset = 0f;
    return GetTableCells(page, area, rowHeight, rowHeightOffset, columnWidth, landscape);
}
/// <summary>
/// Collects the transaction rows, starting at <c>transactionsPageNumber</c>.
/// Pages are skipped until <c>GetRowsToHeader</c> reports a header row; from then on every
/// page's table is read. A row whose first cell contains a dd/mm/yyyy date is kept, with the
/// date rewritten as yyyy-mm-dd and the first character of the third cell removed (a third cell
/// of "-" becomes empty). Rows matching "Opening Balance" (second cell) or "MA_CASH" (first cell)
/// are skipped. Reading stops at the first other row whose first cell is non-empty and does not
/// match "Code".
/// </summary>
/// <returns>The kept rows, in reading order.</returns>
public override List<string[]> GetTransactions()
{
    List<string[]> result = new List<string[]>();
    bool headerFound = false;
    bool transactionsEnd = false;

    for (int page = transactionsPageNumber; page <= numberOfPages && !transactionsEnd; page++)
    {
        int rowsToHeader = 0;

        // Find where the transactions start.
        if (!headerFound)
        {
            rowsToHeader = GetRowsToHeader(page);
            if (rowsToHeader > 0)
            {
                headerFound = true;
            }
        }

        if (!headerFound)
        {
            continue;
        }

        // On the page where the header was found, the area is shortened by the rows counted up to it.
        RectangleJ area = GetTransactionsArea(page);
        area.Height -= rowsToHeader * rowHeight;

        List<string[]> rows = GetTableCells(page, area, rowHeight, transactionsColumnWidth, true);
        foreach (string[] row in rows)
        {
            if (Regex.Match(row[1], "Opening Balance").Success || Regex.Match(row[0], "MA_CASH").Success)
            {
                continue;
            }

            Match match = Regex.Match(row[0], @"(\d{2})\/(\d{2})\/(\d{4})");
            if (match.Success)
            {
                // Reformat the date as yyyy-mm-dd.
                row[0] = match.Groups[3].Value + "-" + match.Groups[2].Value + "-" + match.Groups[1].Value;

                // An empty cell is kept empty: "".Substring(1) would throw ArgumentOutOfRangeException.
                row[2] = (row[2] == "-" || row[2].Length == 0) ? "" : row[2].Substring(1);
                result.Add(row);
            }
            else if (row[0] != "" && !Regex.Match(row[0], "Code").Success)
            {
                transactionsEnd = true;
                break;
            }
        }
    }

    return result;
}
/// <summary>
/// Collects the transaction rows found between a "STATEMENT OPENING BALANCE" row and the
/// following "CLOSING BALANCE" row, over all pages.
/// Dated rows (two digits, a space or dash, and a month abbreviation in the first cell) get
/// their first cell rewritten as dd-Mon-year, where the year is the first cell of the most
/// recent opening-balance row ("2015" until one is seen). A row with an empty first cell that
/// follows a dated row is merged into that previous row.
/// </summary>
/// <returns>The dated rows seen while inside a statement, in reading order.</returns>
public override List<string[]> GetTransactions()
{
    Regex openingPattern = new Regex("STATEMENT OPENING BALANCE");
    Regex closingPattern = new Regex("CLOSING BALANCE");
    Regex datePattern = new Regex(@"(\d{2})(\s|-)(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)");

    List<string[]> transactions = new List<string[]>();
    string year = "2015";
    bool insideStatement = false;
    RectangleJ currentArea = transactionsFirstPage;

    for (int page = 1; page <= numberOfPages; page++)
    {
        bool closingFound = false;
        List<string[]> rows = GetTableCells(page, currentArea, rowHeight, transactionsColumnWidth);

        for (int i = 0; i < rows.Count; i++)
        {
            string[] row = rows[i];
            RemoveDots(row);

            // Opening row: start collecting, switch to the general page layout, pick up the year.
            if (openingPattern.IsMatch(row[1]))
            {
                insideStatement = true;
                currentArea = transactionsGeneralPage;
                year = row[0];
                continue;
            }

            // Closing row: the rest of this page is ignored and the first-page layout is restored.
            if (closingPattern.IsMatch(row[1]))
            {
                currentArea = transactionsFirstPage;
                closingFound = true;
                break;
            }

            Match dateMatch = datePattern.Match(row[0]);
            if (!dateMatch.Success)
            {
                // Continuation line: fold it into the previous row when that row carries a date.
                if (row[0] == "" && i > 0 && datePattern.IsMatch(rows[i - 1][0]))
                {
                    string[] previous = rows[i - 1];
                    previous[1] += " " + row[1];
                    previous[2] = row[2];
                    previous[3] = row[3];
                }

                continue;
            }

            row[0] = dateMatch.Groups[1].Value + "-" + dateMatch.Groups[3].Value + "-" + year;

            if (insideStatement)
            {
                transactions.Add(row);
            }
        }

        // After a closing row, nothing is collected until the next opening row.
        if (closingFound)
        {
            insideStatement = false;
        }
    }

    return transactions;
}
/**
 * Constructs a <CODE>Rectangle</CODE>-object from a <CODE>RectangleJ</CODE>.
 * The four values passed on are rect.X, rect.Y, rect.X + rect.Width and
 * rect.Y + rect.Height, i.e. the origin and the opposite corner.
 * @param rect the RectangleJ to convert
 */
public Rectangle(RectangleJ rect)
    : this(
        rect.X,
        rect.Y,
        rect.X + rect.Width,
        rect.Y + rect.Height)
{
}