Example #1
0
        /// <summary>
        /// Runs text extraction over every page of a PDF and returns the extracted chunks whose text
        /// is exactly equal to <paramref name="textToBeFound"/>.
        /// </summary>
        /// <param name="file">Path of the PDF file to scan.</param>
        /// <param name="textToBeFound">Text a chunk must match exactly (ordinal, case-sensitive).</param>
        /// <returns>The matching chunks; empty when nothing matches.</returns>
        private List <RectAndText> GetTextAndCoords(string file, string textToBeFound)
        {
            List <RectAndText> returnRATs = new List <RectAndText>();

            // Declared with its concrete type so Page and myPoints are reachable without casts.
            // The same instance is used for all pages; myPoints is read once after the loop
            // (presumably it accumulates the chunks of every page - confirm in the strategy class).
            MyLocationTextExtractionStrategy strategy = new MyLocationTextExtractionStrategy();

            using (PdfReader reader = new PdfReader(file))
            {
                for (int page = 1; page <= reader.NumberOfPages; page++)
                {
                    strategy.Page = page;

                    // The returned page text is not needed here; the call is made so the strategy
                    // gets to see the page. (The original appended it to a buffer nobody read.)
                    PdfTextExtractor.GetTextFromPage(reader, page, strategy);
                }
            }

            foreach (RectAndText RAT in strategy.myPoints)
            {
                // Null guard: a chunk without text can never match and must not abort the scan
                if (RAT.Text != null && RAT.Text.Equals(textToBeFound))
                {
                    logger.Debug("Text found: " + RAT.Text + " Page: " + RAT.Page + " Coords: " + RAT.Rect.GetTop(0) + ", " + RAT.Rect.GetBottom(0) + ", " + RAT.Rect.GetLeft(0) + ", " + RAT.Rect.GetRight(0));

                    returnRATs.Add(RAT);
                }
            }

            return(returnRATs);
        }
Example #2
0
        /// <summary>
        /// Copies the PDF at <paramref name="inputFilePath"/> to <paramref name="outputFilePath"/>. For every
        /// rectangle the extraction strategy reports for <paramref name="textToBeSearched"/>, an image is
        /// stamped over the rectangle and <paramref name="textToAdd"/> is written on top of it.
        /// </summary>
        /// <param name="textToBeSearched">Text handed to the extraction strategy to locate.</param>
        /// <param name="textToAdd">Replacement text drawn at each located position.</param>
        /// <param name="outputFilePath">Path of the PDF to create (overwritten if it exists).</param>
        /// <param name="inputFilePath">Path of the PDF to read.</param>
        /// <remarks>
        /// Failures are not propagated to the caller (best-effort, as before); they are reported through
        /// <see cref="System.Diagnostics.Trace"/> instead of being silently discarded.
        /// </remarks>
        private void ReplaceText(string textToBeSearched, string textToAdd, string outputFilePath, string inputFilePath)
        {
            try
            {
                // The output file is opened exactly once. The original opened it a second time with
                // FileMode.Create into a stream that was never used.
                using (Stream inputPdfStream = new FileStream(inputFilePath, FileMode.Open, FileAccess.Read, FileShare.Read))
                    using (Stream outputPdfStream = new FileStream(outputFilePath, FileMode.Create, FileAccess.Write, FileShare.ReadWrite))
                    {
                        //Opens the unmodified PDF for reading
                        PdfReader reader = new PdfReader(inputPdfStream);
                        try
                        {
                            //Creates a stamper that writes the modified copy to the output stream
                            PdfStamper stamper = new PdfStamper(reader, outputPdfStream); //{ FormFlattening = true, FreeTextFlattening = true };
                            try
                            {
                                // Chinese font (KaiU); created on first use so it is loaded at most once
                                // and not at all when there is nothing to replace
                                BaseFont bf = null;

                                for (var i = 1; i <= reader.NumberOfPages; i++)
                                {
                                    var tt = new MyLocationTextExtractionStrategy(textToBeSearched);
                                    // The returned text is not needed, only the rectangles collected in tt.myPoints
                                    PdfTextExtractor.GetTextFromPage(reader, i, tt);

                                    foreach (var p in tt.myPoints)
                                    {
                                        PdfContentByte cb = stamper.GetOverContent(i); // i is the page number

                                        // Bitmap rejects zero-sized dimensions, so clamp to at least one pixel
                                        int width  = Math.Max(1, (int)p.Rect.Width);
                                        int height = Math.Max(1, (int)p.Rect.Height);

                                        //Creates an image that is the size needed to hide the text being removed
                                        using (Bitmap transparentBitmap = new Bitmap(width, height))
                                        {
                                            transparentBitmap.MakeTransparent();
                                            iTextSharp.text.Image image = iTextSharp.text.Image.GetInstance(transparentBitmap, new BaseColor(255, 255, 255));
                                            //Sets the position of the image (i.e. the location of the text to be removed)
                                            image.SetAbsolutePosition(p.Rect.Left, (p.Rect.Top - 15));
                                            //Adds the image to the output pdf
                                            cb.AddImage(image, true);
                                        }

                                        if (bf == null)
                                        {
                                            string fontPath = "c:\\windows\\fonts\\KAIU.TTF";
                                            bf = BaseFont.CreateFont(fontPath, BaseFont.IDENTITY_H, BaseFont.NOT_EMBEDDED);
                                        }

                                        // select the font properties
                                        cb.SetColorFill(BaseColor.BLACK);
                                        cb.SetFontAndSize(bf, 14);

                                        // write the text in the pdf content
                                        cb.BeginText();
                                        // alignment 1 with fixed offsets relative to the located rectangle
                                        cb.ShowTextAligned(1, textToAdd, p.Rect.Left + 14, p.Rect.Top - 10, 0);
                                        cb.EndText();
                                    }
                                }
                            }
                            finally
                            {
                                // Closing the stamper is what writes the output pdf; do it even on failure
                                // so the stamper's resources are released
                                stamper.Close();
                            }
                        }
                        finally
                        {
                            reader.Close();
                        }
                    }
            }
            catch (Exception ex)
            {
                // Still best-effort for the caller, but the failure is now visible to diagnostics
                System.Diagnostics.Trace.TraceError("ReplaceText failed for '{0}' -> '{1}': {2}", inputFilePath, outputFilePath, ex);
            }
        }
Example #3
0
        /// <summary>
        /// Searches every page of <paramref name="sourceFile"/> for the comma-separated terms contained in each
        /// entry of <paramref name="searchValues"/>. Each match gets a yellow highlight annotation in a copy
        /// written to <paramref name="destinationFile"/>, and the terms found per entry are recorded in
        /// <c>_foundValuesInFirstPdf</c> or <c>_foundValuesInSecindPdf</c>.
        /// </summary>
        /// <param name="sc">Not used by this method.</param>
        /// <param name="sourceFile">Path of the PDF to search; nothing is processed when it does not exist.</param>
        /// <param name="destinationFile">Path of the highlighted copy; created (overwritten) on the first match.</param>
        /// <param name="searchTerm">Not used by this method.</param>
        /// <param name="excelRowNumber">Not used by this method.</param>
        /// <param name="searchValues">Pairs of key and comma-separated search terms.</param>
        public void ProcessPdf(StringComparison sc, string sourceFile, string destinationFile, string searchTerm, int excelRowNumber, List <KeyValuePair <int, string> > searchValues)
        {
            myProgressBar.Maximum = searchValues.Count;

            Cursor = Cursors.WaitCursor;
            try
            {
                if (!File.Exists(sourceFile))
                {
                    return;
                }

                myProgressBar.Value = 0;
                PdfStamper stamper = null;

                // pReader backs the stamper. A second reader, opened once, is used for text extraction
                // (the original reopened the file for every item, term and page).
                using (var pReader = new PdfReader(sourceFile))
                    using (var extractionReader = new PdfReader(sourceFile))
                    {
                        try
                        {
                            foreach (var item in searchValues)
                            {
                                var foundText = string.Empty;

                                foreach (var search in item.Value.Split(','))
                                {
                                    // A term is recorded once per item even if it occurs on several pages
                                    var termRecorded = false;

                                    for (var page = 1; page <= pReader.NumberOfPages; page++)
                                    {
                                        var t = new MyLocationTextExtractionStrategy(search, CompareOptions.Ordinal);

                                        // Extract the page being processed (the original always read page 1,
                                        // so highlights ended up on the wrong pages)
                                        PdfTextExtractor.GetTextFromPage(extractionReader, page, t);

                                        var matchesFound = t.MyPoints;
                                        if (matchesFound.Count == 0)
                                        {
                                            continue;
                                        }

                                        if (!termRecorded && !string.IsNullOrEmpty(search))
                                        {
                                            foundText   += "," + search;
                                            termRecorded = true;
                                        }

                                        // Create the stamper on the first match. Keying this on the stamper itself
                                        // (not on File.Exists) avoids a null dereference when the destination
                                        // file already exists before the call.
                                        if (stamper == null)
                                        {
                                            stamper = new PdfStamper(pReader, new FileStream(destinationFile, FileMode.Create));
                                        }

                                        if (!_fileList.Contains(destinationFile))
                                        {
                                            _fileList.Add(destinationFile);
                                        }

                                        var cb = stamper.GetUnderContent(page);
                                        cb.SetColorFill(BaseColor.BLACK);

                                        foreach (var rect in matchesFound)
                                        {
                                            if (rect.Text != search)
                                            {
                                                continue;
                                            }

                                            // NOTE(review): this path is never filled or stroked, so it paints nothing;
                                            // kept as in the original - confirm whether it can be dropped.
                                            cb.Rectangle(rect.Rect.Left, rect.Rect.Bottom, rect.Rect.Width, rect.Rect.Height);

                                            // Corner points of the highlighted area:
                                            // bottom-left, bottom-right, top-left, top-right
                                            float[] quad =
                                            {
                                                rect.Rect.Left,
                                                rect.Rect.Bottom,
                                                rect.Rect.Right,
                                                rect.Rect.Bottom,
                                                rect.Rect.Left,
                                                rect.Rect.Top,
                                                rect.Rect.Right,
                                                rect.Rect.Top
                                            };

                                            var highlight = PdfAnnotation.CreateMarkup(stamper.Writer, rect.Rect,
                                                                                       Constants.vbNull.ToString(), PdfAnnotation.MARKUP_HIGHLIGHT, quad);

                                            highlight.Color = BaseColor.YELLOW;

                                            stamper.AddAnnotation(highlight, page);
                                        }
                                    }
                                }

                                if (!string.IsNullOrEmpty(foundText))
                                {
                                    // Substring(1) drops the leading comma
                                    var entry = new KeyValuePair <int, string>(item.Key, foundText.Substring(1));
                                    if (sourceFile == txtFirstPDF.Text)
                                    {
                                        _foundValuesInFirstPdf.Add(entry);
                                    }
                                    else
                                    {
                                        _foundValuesInSecindPdf.Add(entry);
                                    }
                                }

                                myProgressBar.Value = myProgressBar.Value + 1;
                            }
                        }
                        finally
                        {
                            // Must run before the readers are disposed: closing the stamper writes the output
                            if (stamper != null)
                            {
                                stamper.Close();
                            }
                        }
                    }
            }
            finally
            {
                // Restore the cursor even when processing fails
                this.Cursor = Cursors.Default;
            }
        }
        /// <summary>
        /// Extracts the text of one PDF page as rows of columns: chunks are grouped into lines by their
        /// bottom coordinate, and each line is split into columns wherever two neighbouring chunks are
        /// at least 3 points apart horizontally.
        /// </summary>
        /// <param name="path">Path of the PDF file.</param>
        /// <param name="page">Page number passed to the text extractor.</param>
        /// <param name="coord">
        /// Optional region as { left, bottom, right, top }; only chunks lying completely inside it are used
        /// to detect lines. Pass null to use the whole page.
        /// </param>
        /// <returns>One list per detected line, each holding the column texts of that line.</returns>
        public static List <List <string> > getLineText(string path, int page, float[] coord)
        {
            //Create an instance of our strategy
            var t = new MyLocationTextExtractionStrategy();

            using (var r = new PdfReader(path))
            {
                // The returned text is not needed. Per the original comment, this call is what adds the
                // chunks with their coordinates to t.myPoints.
                iTextSharp.text.pdf.parser.PdfTextExtractor.GetTextFromPage(r, page, t);
            }

            // Step 1: collect candidate line bottoms in chunk order, skipping a chunk whose bottom is
            // within 3 points of the previously collected one (same line as its predecessor).
            List <float> bottomPointList = new List <float>();
            foreach (var chunk in t.myPoints)
            {
                if (coord != null &&
                    !(chunk.Rect.Left >= coord[0] &&
                      chunk.Rect.Bottom >= coord[1] &&
                      chunk.Rect.Right <= coord[2] &&
                      chunk.Rect.Top <= coord[3]))
                {
                    continue;
                }

                float bottom = chunk.Rect.Bottom;
                if (bottomPointList.Count == 0 ||
                    Math.Abs(bottomPointList[bottomPointList.Count - 1] - bottom) > 3)
                {
                    bottomPointList.Add(bottom);
                }
            }

            // Step 2: collapse candidates that belong to the same line (equal, or less than 4 points apart,
            // e.g. because of characters like " " or "."). The first occurrence of each line is kept.
            // The original pairwise removal dropped a line entirely when its bottom recurred three or
            // more times, and could remove the line a near-duplicate had been compared against.
            List <float> lineBottoms = new List <float>();
            foreach (var candidate in bottomPointList)
            {
                if (!lineBottoms.Any(kept => Math.Abs(kept - candidate) < 4))
                {
                    lineBottoms.Add(candidate);
                }
            }

            // Step 3: gather the chunks of each line. A chunk belongs to a line when its bottom is less
            // than 3 points away from the line's bottom.
            // NOTE(review): as in the original, this looks at every chunk of the page, so with a non-null
            // coord chunks outside the region can still join a line - confirm whether that is intended.
            List <List <RectAndText> > lineChunksList = new List <List <RectAndText> >();
            foreach (var lineBottom in lineBottoms)
            {
                lineChunksList.Add(t.myPoints
                                   .Where(c => Math.Abs(c.Rect.Bottom - lineBottom) < 3)
                                   .ToList());
            }

            // Step 4: split every line into columns.
            List <List <string> > lineText = new List <List <string> >();
            foreach (var lineChunks in lineChunksList)
            {
                List <string> row = new List <string>();
                var text = "";
                // True when the column being built was opened by a horizontal gap
                var openedByGap = false;

                for (var i = 0; i < lineChunks.Count; i++)
                {
                    text += lineChunks[i].Text;

                    if (i == lineChunks.Count - 1)
                    {
                        // Last chunk of the line. A column that consists only of this chunk after a gap is
                        // always emitted; otherwise the trailing column is emitted only when it is not blank.
                        // This also covers lines with a single chunk, which previously produced an empty row.
                        if (openedByGap || text.Trim() != "")
                        {
                            row.Add(text);
                        }
                        break;
                    }

                    // Less than 3 points between this chunk's right edge and the next chunk's left edge
                    // means the next chunk continues the same column; otherwise it starts a new one
                    if (Math.Abs(lineChunks[i].Rect.Right - lineChunks[i + 1].Rect.Left) < 3)
                    {
                        openedByGap = false;
                    }
                    else
                    {
                        row.Add(text);
                        text        = "";
                        openedByGap = true;
                    }
                }

                lineText.Add(row);
            }

            return(lineText);
        }