/// <summary>
/// Runs a regular-expression text search over a page range of a PDF document
/// and collects one <c>PdfString</c> per highlighted quad of every match.
/// </summary>
/// <param name="doc">The document to search.</param>
/// <param name="pattern">The regular expression handed to the text search.</param>
/// <param name="ifWholeWord">When true, the whole-word search mode is added.</param>
/// <param name="startPage">First page passed to <c>TextSearch.Begin</c>.</param>
/// <param name="endPage">Last page passed to <c>TextSearch.Begin</c>; clamped to the document's page count.</param>
/// <param name="ignoreCase">When true the search is case-insensitive; when false the case-sensitive mode is added.</param>
/// <returns>The matched text, a rectangle built from the quad's first and third vertices, and the page number, for each highlighted quad.</returns>
public List<PdfString> RegexSearch(PDFDoc doc, string pattern, bool ifWholeWord, int startPage, int endPage, bool ignoreCase)
{
    List<PdfString> result = new List<PdfString>();

    Int32 mode = (Int32)(TextSearch.SearchMode.e_reg_expression | TextSearch.SearchMode.e_highlight);
    if (ifWholeWord)
    {
        mode |= (Int32)TextSearch.SearchMode.e_whole_word;
    }
    // Case sensitivity is opt-in on the search mode, so the flag is only set
    // when the caller did NOT ask to ignore case.
    if (!ignoreCase)
    {
        mode |= (Int32)TextSearch.SearchMode.e_case_sensitive;
    }

    int pageCount = doc.GetPageCount();
    if (endPage > pageCount)
    {
        endPage = pageCount;
    }

    // Out-parameters filled by each call to Run().
    Int32 page_num = 0;
    string result_str = "";
    string ambient_string = "";
    Highlights hlts = new Highlights();

    TextSearch txt_search = new TextSearch();
    txt_search.Begin(doc, pattern, mode, startPage, endPage);

    while (true)
    {
        TextSearch.ResultCode code = txt_search.Run(ref page_num, ref result_str, ref ambient_string, hlts);
        if (code == TextSearch.ResultCode.e_done)
        {
            break;
        }
        if (code != TextSearch.ResultCode.e_found)
        {
            // Any other result code carries no match; keep searching.
            continue;
        }

        hlts.Begin(doc);
        while (hlts.HasNext())
        {
            double[] quads = hlts.GetCurrentQuads();
            if (quads != null)
            {
                // A highlight may consist of several quads (8 doubles each),
                // e.g. when the match wraps onto another line; report each one.
                for (int offset = 0; offset + 8 <= quads.Length; offset += 8)
                {
                    Rect area = new Rect(quads[offset], quads[offset + 1], quads[offset + 4], quads[offset + 5]);
                    result.Add(new PdfString(result_str, area, page_num));
                }
            }
            hlts.Next();
        }
    }

    return result;
}
/// <summary>
/// Builds line rectangles for the highlighted text: every highlight rectangle
/// keeps its vertical extent while its horizontal extent is replaced by the
/// first two values returned by <c>GetLeftRightTextBounds</c>.
/// </summary>
/// <param name="hightLights">The highlights information.</param>
/// <returns>The highlight rectangles stretched to the left/right text bounds.</returns>
List<Rect> GetLineRect(Highlights hightLights)
{
    List<Rect> lineRects = GetHightlightRect(hightLights);

    // Rewind the highlights so the page number of the first entry can be read.
    hightLights.Begin(_pdfDoc);
    int pageNumber = hightLights.GetCurrentPageNumber();
    double[] bounds = GetLeftRightTextBounds(pageNumber);

    int index = 0;
    while (index < lineRects.Count)
    {
        Rect current = lineRects[index];
        // bounds[0] / bounds[1] are used as the new x1 / x2 of the rectangle.
        lineRects[index] = new Rect(bounds[0], current.y1, bounds[1], current.y2);
        index++;
    }

    return lineRects;
}
/// <summary>
/// Converts every quad of the given highlights into an axis-aligned bounding
/// rectangle, after transforming the quad vertices with the default matrix of
/// the page the highlight is on.
/// </summary>
/// <param name="hightLights">The highlights information.</param>
/// <returns>The bounding rectangles, sorted with <c>PositionComparer.CompareRectPos</c>.</returns>
List<Rect> GetHightlightRect(Highlights hightLights)
{
    List<Rect> rects = new List<Rect>();
    hightLights.Begin(_pdfDoc);

    while (hightLights.HasNext())
    {
        // Resolve the page for each highlight instead of once up front, so
        // highlights on different pages are transformed with their own matrix.
        Page page = _pdfDoc.GetPage(hightLights.GetCurrentPageNumber());
        Matrix2D matrix = page.GetDefaultMatrix();

        double[] quads = hightLights.GetCurrentQuads();
        int quadCount = quads.Length / 8;
        for (int q = 0; q < quadCount; ++q)
        {
            // Assume each quad is an axis-aligned rectangle: four (x, y)
            // vertices stored as eight consecutive doubles.
            int offset = 8 * q;
            double x1 = double.MaxValue;
            double y1 = double.MaxValue;
            double x2 = double.MinValue;
            double y2 = double.MinValue;

            // Only complete quads are visited, so a trailing partial quad can
            // never cause an out-of-range access.
            for (int v = 0; v < 8; v += 2)
            {
                matrix.Mult(ref quads[offset + v], ref quads[offset + v + 1]);
                x1 = Math.Min(x1, quads[offset + v]);
                x2 = Math.Max(x2, quads[offset + v]);
                y1 = Math.Min(y1, quads[offset + v + 1]);
                y2 = Math.Max(y2, quads[offset + v + 1]);
            }

            rects.Add(new Rect(x1, y1, x2, y2));
        }

        hightLights.Next();
    }

    rects.Sort(PositionComparer.CompareRectPos);
    return rects;
}
/// <summary>
/// Sample driver for the text search API. It walks through four chained
/// searches on the test document: a name, a credit card number, an AMEX
/// number, and (searching upwards) the AMEX owner's name, whose location is
/// then turned into link annotations before the document is saved.
/// </summary>
static void Main(string[] args)
{
    PDFNet.Initialize();

    // Relative path to the folder containing test files.
    string inputDir = "../../TestFiles/";

    try
    {
        using (PDFDoc doc = new PDFDoc(inputDir + "credit card numbers.pdf"))
        {
            doc.InitSecurityHandler();

            // Values written by each call to Run().
            Int32 pageNumber = 0;
            String matchText = "";
            String ambientText = "";
            Highlights highlights = new Highlights();

            TextSearch search = new TextSearch();
            Int32 mode = (Int32)(TextSearch.SearchMode.e_whole_word
                                 | TextSearch.SearchMode.e_page_stop
                                 | TextSearch.SearchMode.e_highlight);
            String pattern = "joHn sMiTh";

            // Begin() initializes the search; Run() is then called repeatedly
            // to step through the matching instances.
            search.Begin(doc, pattern, mode, -1, -1);

            int step = 0;
            bool finished = false;
            while (!finished)
            {
                TextSearch.ResultCode code = search.Run(ref pageNumber, ref matchText, ref ambientText, highlights);

                if (code == TextSearch.ResultCode.e_page)
                {
                    // A page boundary was reached; a UI could be updated here.
                    continue;
                }
                if (code != TextSearch.ResultCode.e_found)
                {
                    break;
                }

                switch (step)
                {
                    case 0:
                    {
                        // Found the name; 'e_ambient_string' is not part of the mode here.
                        Console.WriteLine(matchText + "'s credit card number is: ");

                        // Switch to a regular expression to find the card number.
                        mode = search.GetMode()
                               | (Int32)(TextSearch.SearchMode.e_reg_expression | TextSearch.SearchMode.e_highlight);
                        search.SetMode(mode);
                        pattern = "\\d{4}-\\d{4}-\\d{4}-\\d{4}"; // or "(\\d{4}-){3}\\d{4}"
                        search.SetPattern(pattern);
                        step = 1;
                        break;
                    }

                    case 1:
                    {
                        // Found the credit card number.
                        Console.WriteLine(" " + matchText);

                        // Output the highlight info of the credit card number.
                        for (highlights.Begin(doc); highlights.HasNext(); highlights.Next())
                        {
                            Console.WriteLine("The current highlight is from page: " + highlights.GetCurrentPageNumber());
                        }

                        // See if there is an AMEX card number.
                        pattern = "\\d{4}-\\d{6}-\\d{5}";
                        search.SetPattern(pattern);
                        step = 2;
                        break;
                    }

                    case 2:
                    {
                        // Found an AMEX card number.
                        Console.WriteLine("\nThere is an AMEX card number:\n " + matchText);

                        // Search upwards for the card owner; the owner's name is
                        // expected to precede the number.
                        mode = search.GetMode() | (Int32)(TextSearch.SearchMode.e_search_up);
                        search.SetMode(mode);
                        pattern = "[A-z]++ [A-z]++";
                        search.SetPattern(pattern);
                        step = 3;
                        break;
                    }

                    case 3:
                    {
                        // Found the owner's name of the AMEX card.
                        Console.WriteLine("Is the owner's name:\n " + matchText + "?");

                        // Add a link annotation over every quad of the found instance.
                        for (highlights.Begin(doc); highlights.HasNext(); highlights.Next())
                        {
                            Page targetPage = doc.GetPage(highlights.GetCurrentPageNumber());
                            double[] quads = highlights.GetCurrentQuads();

                            for (int offset = 0; offset < 8 * (quads.Length / 8); offset += 8)
                            {
                                // Assume each quad is an axis-aligned rectangle and
                                // take the bounding box of its four vertices.
                                double x1 = quads[offset + 0];
                                double x2 = quads[offset + 0];
                                double y1 = quads[offset + 1];
                                double y2 = quads[offset + 1];
                                for (int v = 2; v < 8; v += 2)
                                {
                                    x1 = Math.Min(x1, quads[offset + v]);
                                    x2 = Math.Max(x2, quads[offset + v]);
                                    y1 = Math.Min(y1, quads[offset + v + 1]);
                                    y2 = Math.Max(y2, quads[offset + v + 1]);
                                }

                                Rect linkArea = new Rect(x1, y1, x2, y2);
                                pdftron.PDF.Action openUri = pdftron.PDF.Action.CreateURI(doc, "http://www.pdftron.com");
                                pdftron.PDF.Annots.Link link = pdftron.PDF.Annots.Link.Create(doc, linkArea, openUri);
                                link.RefreshAppearance();
                                targetPage.AnnotPushBack(link);
                            }
                        }

                        string outputDir = "../../TestFiles/Output/";
                        doc.Save(outputDir + "credit card numbers_linked.pdf", SDFDoc.SaveOptions.e_linearized);
                        finished = true;
                        break;
                    }
                }
            }
        }
    }
    catch (PDFNetException e)
    {
        Console.WriteLine(e.Message);
    }
}