/// <summary>
/// Parses every page of a PDF with <c>MyLocationTextExtractionStrategy</c> and returns the
/// collected chunks whose text equals <paramref name="textToBeFound"/>.
/// </summary>
/// <param name="file">Path of the PDF file to scan.</param>
/// <param name="textToBeFound">Text a chunk must equal (ordinal, case-sensitive comparison).</param>
/// <returns>
/// The matching chunks in the order the strategy collected them; an empty list when nothing matches.
/// </returns>
private List<RectAndText> GetTextAndCoords(string file, string textToBeFound)
{
    List<RectAndText> returnRATs = new List<RectAndText>();

    // Hold the concrete type so Page and myPoints are reachable without repeated downcasts.
    MyLocationTextExtractionStrategy strategy = new MyLocationTextExtractionStrategy();

    using (PdfReader reader = new PdfReader(file))
    {
        for (int page = 1; page <= reader.NumberOfPages; page++)
        {
            // The strategy is given the current page number before that page is parsed.
            strategy.Page = page;

            // Only the side effect is needed: parsing fills strategy.myPoints.
            // The extracted page text itself was never used, so it is no longer accumulated.
            PdfTextExtractor.GetTextFromPage(reader, page, strategy);
        }
    }

    foreach (RectAndText RAT in strategy.myPoints)
    {
        // Static Equals is null-safe, unlike RAT.Text.Equals(...), which throws on a null Text.
        if (!string.Equals(RAT.Text, textToBeFound, StringComparison.Ordinal))
        {
            continue;
        }

        logger.Debug("Text found: " + RAT.Text + " Page: " + RAT.Page + " Coords: " + RAT.Rect.GetTop(0) + ", " + RAT.Rect.GetBottom(0) + ", " + RAT.Rect.GetLeft(0) + ", " + RAT.Rect.GetRight(0));
        returnRATs.Add(RAT);
    }

    return returnRATs;
}
/// <summary>
/// Writes a copy of a PDF in which every chunk the extraction strategy reports for
/// <paramref name="textToBeSearched"/> is covered by an image and overprinted with
/// <paramref name="textToAdd"/>.
/// </summary>
/// <param name="textToBeSearched">Text handed to <c>MyLocationTextExtractionStrategy</c> to locate.</param>
/// <param name="textToAdd">Replacement text drawn at each located position.</param>
/// <param name="outputFilePath">Path of the PDF to create (an existing file is overwritten).</param>
/// <param name="inputFilePath">Path of the PDF to read.</param>
/// <remarks>
/// Best effort: failures are reported through <c>System.Diagnostics.Trace</c> and are not rethrown,
/// so callers keep the original "never throws" contract but the error is no longer lost.
/// </remarks>
private void ReplaceText(string textToBeSearched, string textToAdd, string outputFilePath, string inputFilePath)
{
    try
    {
        // A single output stream: the original opened the same path a second time with
        // FileMode.Create and never used that handle.
        using (Stream inputPdfStream = new FileStream(inputFilePath, FileMode.Open, FileAccess.Read, FileShare.Read))
        using (Stream outputPdfStream = new FileStream(outputFilePath, FileMode.Create, FileAccess.Write, FileShare.ReadWrite))
        using (PdfReader reader = new PdfReader(inputPdfStream))
        {
            // The stamper copies the source document and lets us draw on top of its pages.
            PdfStamper stamper = new PdfStamper(reader, outputPdfStream);
            try
            {
                // Created lazily on the first match and then reused, so the font file is read
                // at most once and not at all when nothing matches.
                BaseFont bf = null;

                for (var i = 1; i <= reader.NumberOfPages; i++)
                {
                    var tt = new MyLocationTextExtractionStrategy(textToBeSearched);

                    // Only the side effect is needed: parsing fills tt.myPoints with the rectangles.
                    PdfTextExtractor.GetTextFromPage(reader, i, tt);

                    foreach (var p in tt.myPoints)
                    {
                        // Fetched per match (not per page) so pages without matches are left untouched.
                        PdfContentByte cb = stamper.GetOverContent(i);

                        // Cover image sized like the located rectangle. Bitmap rejects a zero
                        // dimension, so sub-point rectangles are clamped to one pixel.
                        int coverWidth = Math.Max(1, (int)p.Rect.Width);
                        int coverHeight = Math.Max(1, (int)p.Rect.Height);
                        iTextSharp.text.Image image;
                        using (Bitmap transparentBitmap = new Bitmap(coverWidth, coverHeight))
                        {
                            transparentBitmap.MakeTransparent();
                            image = iTextSharp.text.Image.GetInstance(transparentBitmap, new BaseColor(255, 255, 255));
                        }

                        // NOTE(review): the -15 / +14 / -10 offsets below are hand-tuned values kept
                        // from the original; confirm they suit the fonts of the target documents.
                        image.SetAbsolutePosition(p.Rect.Left, (p.Rect.Top - 15));
                        cb.AddImage(image, true);

                        // Chinese font setup (KaiU typeface, identity-H encoding, not embedded).
                        if (bf == null)
                        {
                            string fontPath = "c:\\windows\\fonts\\KAIU.TTF";
                            bf = BaseFont.CreateFont(fontPath, BaseFont.IDENTITY_H, BaseFont.NOT_EMBEDDED);
                        }

                        cb.SetColorFill(BaseColor.BLACK);
                        cb.SetFontAndSize(bf, 14);

                        // Alignment code 1 is kept from the original (center alignment in iText).
                        cb.BeginText();
                        cb.ShowTextAligned(1, textToAdd, p.Rect.Left + 14, p.Rect.Top - 10, 0);
                        cb.EndText();
                    }
                }
            }
            finally
            {
                // Always finish the stamper before the output stream is disposed.
                stamper.Close();
            }
        }
    }
    catch (Exception ex)
    {
        // Still best effort, but no longer silent.
        System.Diagnostics.Trace.TraceError("ReplaceText failed for '" + inputFilePath + "' -> '" + outputFilePath + "': " + ex);
    }
}
/// <summary>
/// Searches a PDF for the comma-separated terms of each entry in <paramref name="searchValues"/>,
/// adds a yellow highlight annotation over every exact match in a stamped copy written to
/// <paramref name="destinationFile"/>, and records which terms were found per entry.
/// </summary>
/// <param name="sc">Not used by this method; kept so existing callers compile unchanged.</param>
/// <param name="sourceFile">Path of the PDF to search. Nothing is processed when it does not exist.</param>
/// <param name="destinationFile">Path of the highlighted copy; created on the first match.</param>
/// <param name="searchTerm">Not used by this method; kept so existing callers compile unchanged.</param>
/// <param name="excelRowNumber">Not used by this method; kept so existing callers compile unchanged.</param>
/// <param name="searchValues">
/// Entries whose <c>Value</c> is a comma-separated list of terms. The <c>Key</c> is stored together
/// with the found terms in <c>_foundValuesInFirstPdf</c> or <c>_foundValuesInSecindPdf</c>.
/// </param>
public void ProcessPdf(StringComparison sc, string sourceFile, string destinationFile, string searchTerm, int excelRowNumber, List<KeyValuePair<int, string>> searchValues)
{
    myProgressBar.Maximum = searchValues.Count;
    Cursor = Cursors.WaitCursor;
    try
    {
        if (!File.Exists(sourceFile))
        {
            return;
        }

        myProgressBar.Value = 0;

        // pReader feeds the stamper. Text extraction uses a second reader, as the original did,
        // but it is opened once and not once per page per term.
        using (var pReader = new PdfReader(sourceFile))
        using (var extractionReader = new PdfReader(sourceFile))
        {
            PdfStamper stamper = null;
            try
            {
                foreach (var item in searchValues)
                {
                    var foundTerms = new List<string>();

                    foreach (var search in item.Value.Split(','))
                    {
                        bool termFound = false;

                        for (var page = 1; page <= pReader.NumberOfPages; page++)
                        {
                            var t = new MyLocationTextExtractionStrategy(search, CompareOptions.Ordinal);

                            // Parse the page being visited. The original always parsed page 1, so
                            // later pages were never searched and page-1 hits were stamped on every page.
                            PdfTextExtractor.GetTextFromPage(extractionReader, page, t);

                            var matchesFound = t.MyPoints;
                            if (matchesFound.Count == 0)
                            {
                                continue;
                            }

                            termFound = true;

                            // Create the stamper on first use. Testing File.Exists(destinationFile)
                            // here left the stamper null (and crashed below) whenever the
                            // destination file was already present.
                            if (stamper == null)
                            {
                                var output = new FileStream(destinationFile, FileMode.Create);
                                try
                                {
                                    stamper = new PdfStamper(pReader, output);
                                }
                                catch
                                {
                                    output.Dispose();
                                    throw;
                                }
                            }

                            if (!_fileList.Contains(destinationFile))
                            {
                                _fileList.Add(destinationFile);
                            }

                            var cb = stamper.GetUnderContent(page);
                            cb.SetColorFill(BaseColor.BLACK);

                            foreach (var rect in matchesFound)
                            {
                                if (rect.Text != search)
                                {
                                    continue;
                                }

                                // NOTE(review): the rectangle path is added but never filled or stroked
                                // here, exactly as in the original; confirm whether a paint call is intended.
                                cb.Rectangle(rect.Rect.Left, rect.Rect.Bottom, rect.Rect.Width, rect.Rect.Height);

                                float[] quad =
                                {
                                    rect.Rect.Left, rect.Rect.Bottom,
                                    rect.Rect.Right, rect.Rect.Bottom,
                                    rect.Rect.Left, rect.Rect.Top,
                                    rect.Rect.Right, rect.Rect.Top
                                };

                                // NOTE(review): Constants.vbNull.ToString() is kept as the annotation
                                // contents; it looks like a VB conversion leftover. Confirm the intended text.
                                var highlight = PdfAnnotation.CreateMarkup(stamper.Writer, rect.Rect, Constants.vbNull.ToString(), PdfAnnotation.MARKUP_HIGHLIGHT, quad);
                                highlight.Color = BaseColor.YELLOW;
                                stamper.AddAnnotation(highlight, page);
                            }
                        }

                        // Record each term once, not once per page on which it was found.
                        if (termFound && !string.IsNullOrEmpty(search))
                        {
                            foundTerms.Add(search);
                        }
                    }

                    if (foundTerms.Count > 0)
                    {
                        var foundText = string.Join(",", foundTerms.ToArray());
                        if (sourceFile == txtFirstPDF.Text)
                        {
                            _foundValuesInFirstPdf.Add(new KeyValuePair<int, string>(item.Key, foundText));
                        }
                        else
                        {
                            _foundValuesInSecindPdf.Add(new KeyValuePair<int, string>(item.Key, foundText));
                        }
                    }

                    myProgressBar.Value = myProgressBar.Value + 1;
                }
            }
            finally
            {
                // Finish the stamper before the readers are disposed.
                if (stamper != null)
                {
                    stamper.Close();
                }
            }
        }
    }
    finally
    {
        // Restore the cursor even when processing fails.
        this.Cursor = Cursors.Default;
    }
}
// Two chunks are put on the same line when their bottom coordinates differ by less than this.
private const float LineBottomTolerance = 3f;

// A baseline closer than this to an earlier (different) baseline is treated as noise of that line.
private const float LineMergeTolerance = 4f;

// Horizontal gap below which two neighbouring chunks belong to the same column.
private const float ColumnGapTolerance = 3f;

/// <summary>
/// Extracts the text of one PDF page as rows of columns: chunks are grouped into lines by their
/// bottom coordinate, and each line is split into columns wherever the horizontal gap between
/// neighbouring chunks reaches <c>ColumnGapTolerance</c>.
/// </summary>
/// <param name="path">Path of the PDF file.</param>
/// <param name="page">Page number passed to the text extractor.</param>
/// <param name="coord">
/// Optional region as { left, bottom, right, top }. When not null, only chunks lying completely
/// inside it are used to detect lines; when null the whole page is used.
/// NOTE(review): as in the original, the region only restricts line detection. Chunks outside the
/// region that share a detected baseline are still included in that row; confirm this is intended.
/// </param>
/// <returns>One list of column strings per detected line, in detection order.</returns>
public static List<List<string>> getLineText(string path, int page, float[] coord)
{
    var t = new MyLocationTextExtractionStrategy();

    using (var r = new PdfReader(path))
    {
        // Only the side effect is needed: parsing fills t.myPoints with chunks and coordinates.
        iTextSharp.text.pdf.parser.PdfTextExtractor.GetTextFromPage(r, page, t);
    }

    List<float> bottomPointList = MergeLineBottoms(CollectLineBottoms(t.myPoints, coord));

    List<List<string>> lineText = new List<List<string>>();
    foreach (var bottomPoint in bottomPointList)
    {
        // Gather every chunk whose bottom lies within the line tolerance of this baseline.
        // (An exact match is covered by the same test, so no separate equality check is needed.)
        var lineChunks = new List<RectAndText>();
        foreach (var chunk in t.myPoints)
        {
            if (Math.Abs(chunk.Rect.Bottom - bottomPoint) < LineBottomTolerance)
            {
                lineChunks.Add(chunk);
            }
        }

        lineText.Add(SplitIntoColumns(lineChunks));
    }

    return lineText;
}

// Lists the bottom coordinate of each run of chunks whose bottom differs from the previously
// recorded one by more than the line tolerance, optionally restricted to the coord region.
private static List<float> CollectLineBottoms(IEnumerable<RectAndText> chunks, float[] coord)
{
    var bottoms = new List<float>();
    foreach (var chunk in chunks)
    {
        if (coord != null)
        {
            bool inside = chunk.Rect.Left >= coord[0] && chunk.Rect.Bottom >= coord[1]
                          && chunk.Rect.Right <= coord[2] && chunk.Rect.Top <= coord[3];
            if (!inside)
            {
                continue;
            }
        }

        float bottom = chunk.Rect.Bottom;
        if (bottoms.Count == 0 || Math.Abs(bottoms[bottoms.Count - 1] - bottom) > LineBottomTolerance)
        {
            bottoms.Add(bottom);
        }
    }

    return bottoms;
}

// Drops baselines that are near-duplicates of an earlier one and collapses exact repeats to a
// single entry (the last occurrence, which is what the original produced for a value seen twice).
private static List<float> MergeLineBottoms(List<float> bottoms)
{
    // Values that sit within the merge tolerance of, but are not equal to, an earlier entry.
    // Every entry carrying such a value is removed.
    var nearDuplicates = new HashSet<float>();
    for (int j = 1; j < bottoms.Count; j++)
    {
        for (int i = 0; i < j; i++)
        {
            float gap = Math.Abs(bottoms[j] - bottoms[i]);
            if (gap != 0 && gap < LineMergeTolerance)
            {
                nearDuplicates.Add(bottoms[j]);
                break;
            }
        }
    }

    // Walk backwards so the last occurrence of each remaining value is the one kept.
    // The original removed one entry per equal *pair*, which erased a baseline completely
    // as soon as it occurred three or more times.
    var merged = new List<float>();
    var seen = new HashSet<float>();
    for (int k = bottoms.Count - 1; k >= 0; k--)
    {
        float value = bottoms[k];
        if (nearDuplicates.Contains(value))
        {
            continue;
        }

        if (seen.Add(value))
        {
            merged.Add(value);
        }
    }

    merged.Reverse();
    return merged;
}

// Concatenates the chunk texts of one line, starting a new column wherever the gap between a
// chunk's right edge and the next chunk's left edge is not below the column tolerance.
private static List<string> SplitIntoColumns(List<RectAndText> lineChunks)
{
    var columns = new List<string>();
    string text = "";
    bool lastGapWasBreak = false;

    for (int i = 0; i < lineChunks.Count; i++)
    {
        text += lineChunks[i].Text;

        if (i == lineChunks.Count - 1)
        {
            // Last chunk: nothing to compare against. This also covers a line made of a single
            // chunk, whose text the original silently dropped.
            break;
        }

        bool sameColumn = Math.Abs(lineChunks[i].Rect.Right - lineChunks[i + 1].Rect.Left) < ColumnGapTolerance;
        lastGapWasBreak = !sameColumn;
        if (!sameColumn)
        {
            columns.Add(text);
            text = "";
        }
    }

    // A final column that directly follows a break is always kept (even if blank), matching the
    // original; otherwise the trailing text is kept only when it is not blank.
    if (lastGapWasBreak || text.Trim() != "")
    {
        columns.Add(text);
    }

    return columns;
}