/// <summary>
/// Searches the demo PDF for the posted text, covers every match with a yellow
/// rectangle and returns the modified document as a download.
/// </summary>
/// <param name="collection">
/// Posted form values. Reads <c>textBoxTextToSearch</c>, <c>textBoxFromPage</c>,
/// <c>textBoxToPage</c> (optional: a missing or empty value is passed to the search as 0),
/// <c>checkBoxMatchCase</c> and <c>checkBoxMatchWholeWord</c> (treated as checked when
/// the field is present in the form).
/// </param>
/// <returns>A file result named "SearchText.pdf" with content type "application/pdf".</returns>
/// <exception cref="FormatException">A supplied page number is not a valid integer.</exception>
public ActionResult SearchText(IFormCollection collection)
{
    // get the PDF file; Path.Combine keeps the path valid regardless of the host's directory separator
    string pdfFile = System.IO.Path.Combine(m_hostingEnvironment.ContentRootPath, "wwwroot", "DemoFiles", "Pdf", "InputPdf.pdf");

    // get the text to search
    string textToSearch = collection["textBoxTextToSearch"];

    // create the PDF text extractor
    PdfTextExtract pdfTextExtract = new PdfTextExtract();

    // set a demo serial number
    pdfTextExtract.SerialNumber = "YCgJMTAE-BiwJAhIB-EhlWTlBA-UEBRQFBA-U1FOUVJO-WVlZWQ==";

    int fromPdfPageNumber = int.Parse(collection["textBoxFromPage"]);

    // the "to page" field is optional: check Count before indexing so a form that
    // omits the field falls back to 0 instead of throwing IndexOutOfRangeException
    var toPageValues = collection["textBoxToPage"];
    int toPdfPageNumber = toPageValues.Count > 0 && !string.IsNullOrEmpty(toPageValues[0])
        ? int.Parse(toPageValues)
        : 0;

    // search the text in PDF document
    PdfTextSearchItem[] searchTextInstances = pdfTextExtract.SearchText(pdfFile, textToSearch,
        fromPdfPageNumber, toPdfPageNumber,
        collection["checkBoxMatchCase"].Count > 0,
        collection["checkBoxMatchWholeWord"].Count > 0);

    // load the PDF file to highlight the searched text
    PdfDocument pdfDocument = PdfDocument.FromFile(pdfFile);

    // everything that touches the open document runs inside try/finally so the
    // document is closed even when highlighting or serialization fails
    try
    {
        // set a demo serial number
        pdfDocument.SerialNumber = "YCgJMTAE-BiwJAhIB-EhlWTlBA-UEBRQFBA-U1FOUVJO-WVlZWQ==";

        // highlight the searched text in PDF document
        foreach (PdfTextSearchItem searchTextInstance in searchTextInstances)
        {
            PdfRectangle pdfRectangle = new PdfRectangle(searchTextInstance.BoundingRectangle);

            // set rectangle color and opacity
            pdfRectangle.BackColor = Color.Yellow;
            pdfRectangle.Opacity = 30;

            // highlight the text; PdfPageNumber is used as a 1-based number against the 0-based Pages collection
            pdfDocument.Pages[searchTextInstance.PdfPageNumber - 1].Layout(pdfRectangle);
        }

        // write the PDF document to a memory buffer
        byte[] pdfBuffer = pdfDocument.WriteToMemory();

        FileResult fileResult = new FileContentResult(pdfBuffer, "application/pdf");
        fileResult.FileDownloadName = "SearchText.pdf";

        return fileResult;
    }
    finally
    {
        pdfDocument.Close();
    }
}
/// <summary>
/// Click handler that extracts text from a page range of the demo PDF and sends it
/// to the browser as a UTF-8 text attachment named "ExtractedText.txt".
/// </summary>
/// <param name="sender">Event source (not used).</param>
/// <param name="e">Event data (not used).</param>
protected void buttonExtractText_Click(object sender, EventArgs e)
{
    // Demo input document, located relative to the application root.
    string inputPdfPath = Server.MapPath("~") + @"\DemoFiles\Pdf\InputPdf.pdf";

    // Extractor configured with the demo serial number and the selected extraction mode.
    PdfTextExtract extractor = new PdfTextExtract
    {
        SerialNumber = "YCgJMTAE-BiwJAhIB-EhlWTlBA-UEBRQFBA-U1FOUVJO-WVlZWQ==",
        TextExtractMode = GetTextExtractMode()
    };

    // Page range: the "to" box may be left empty, in which case 0 is passed.
    int firstPage = int.Parse(textBoxFromPage.Text);
    int lastPage = 0;
    if (textBoxToPage.Text.Length > 0)
    {
        lastPage = int.Parse(textBoxToPage.Text);
    }

    string extractedText = extractor.ExtractText(inputPdfPath, firstPage, lastPage);

    // Response body = UTF-8 marker bytes followed by the UTF-8 encoded text.
    byte[] encodedText = Encoding.UTF8.GetBytes(extractedText);
    byte[] bom = { 0xEF, 0xBB, 0xBF };
    byte[] payload = new byte[encodedText.Length + bom.Length];
    bom.CopyTo(payload, 0);
    encodedText.CopyTo(payload, bom.Length);

    var response = HttpContext.Current.Response;

    // Tell the browser the data format.
    response.AddHeader("Content-Type", "text/plain; charset=UTF-8");

    // Tell the browser to download the document and which file name to use.
    string disposition = String.Format("{0}; filename=ExtractedText.txt; size={1}", "attachment", payload.Length.ToString());
    response.AddHeader("Content-Disposition", disposition);

    // Send the bytes, then End() the response to stop ASP.NET page processing.
    response.BinaryWrite(payload);
    response.End();
}
/// <summary>
/// Extracts the text from a page range of the demo PDF and returns it as a UTF-8
/// text file download.
/// </summary>
/// <param name="collection">
/// Posted form values. Stored in <c>m_formCollection</c> before the extraction mode is
/// resolved. Reads <c>textBoxFromPage</c> and <c>textBoxToPage</c> (optional: a missing
/// or empty value is passed to the extractor as 0).
/// </param>
/// <returns>
/// A file result named "ExtractedText.txt" with content type "text/plain; charset=UTF-8",
/// whose bytes are the UTF-8 marker followed by the UTF-8 encoded text.
/// </returns>
/// <exception cref="FormatException">A supplied page number is not a valid integer.</exception>
public ActionResult ExtractText(IFormCollection collection)
{
    m_formCollection = collection;

    // get the PDF file; Path.Combine keeps the path valid regardless of the host's directory separator
    string pdfFile = System.IO.Path.Combine(m_hostingEnvironment.ContentRootPath, "wwwroot", "DemoFiles", "Pdf", "InputPdf.pdf");

    // create the PDF text extractor
    PdfTextExtract pdfTextExtract = new PdfTextExtract();

    // set a demo serial number
    pdfTextExtract.SerialNumber = "YCgJMTAE-BiwJAhIB-EhlWTlBA-UEBRQFBA-U1FOUVJO-WVlZWQ==";

    // set the text extraction mode
    pdfTextExtract.TextExtractMode = GetTextExtractMode();

    int fromPdfPageNumber = int.Parse(collection["textBoxFromPage"]);

    // the "to page" field is optional: check Count before indexing so a form that
    // omits the field falls back to 0 instead of throwing IndexOutOfRangeException
    var toPageValues = collection["textBoxToPage"];
    int toPdfPageNumber = toPageValues.Count > 0 && !string.IsNullOrEmpty(toPageValues[0])
        ? int.Parse(toPageValues)
        : 0;

    // extract the text from a range of pages of the PDF document
    string text = pdfTextExtract.ExtractText(pdfFile, fromPdfPageNumber, toPdfPageNumber);

    // get UTF-8 bytes
    byte[] utf8Bytes = Encoding.UTF8.GetBytes(text);

    // the UTF-8 marker
    byte[] utf8Marker = new byte[] { 0xEF, 0xBB, 0xBF };

    // the text document bytes with UTF-8 marker followed by UTF-8 bytes
    byte[] bytes = new byte[utf8Bytes.Length + utf8Marker.Length];
    Array.Copy(utf8Marker, 0, bytes, 0, utf8Marker.Length);
    Array.Copy(utf8Bytes, 0, bytes, utf8Marker.Length, utf8Bytes.Length);

    FileResult fileResult = new FileContentResult(bytes, "text/plain; charset=UTF-8");
    fileResult.FileDownloadName = "ExtractedText.txt";

    return fileResult;
}
/// <summary>
/// Click handler that searches the demo PDF for the entered text, covers every match
/// with a yellow rectangle and sends the modified document to the browser as the
/// attachment "SearchText.pdf".
/// </summary>
/// <param name="sender">Event source (not used).</param>
/// <param name="e">Event data (not used).</param>
protected void buttonSearchText_Click(object sender, EventArgs e)
{
    // get the PDF file
    string pdfFile = Server.MapPath("~") + @"\DemoFiles\Pdf\InputPdf.pdf";

    // get the text to search
    string textToSearch = textBoxTextToSearch.Text;

    // create the PDF text extractor
    PdfTextExtract pdfTextExtract = new PdfTextExtract();

    // set a demo serial number
    pdfTextExtract.SerialNumber = "YCgJMTAE-BiwJAhIB-EhlWTlBA-UEBRQFBA-U1FOUVJO-WVlZWQ==";

    // an empty "to page" box is passed to the search as 0
    int fromPdfPageNumber = int.Parse(textBoxFromPage.Text);
    int toPdfPageNumber = textBoxToPage.Text.Length > 0 ? int.Parse(textBoxToPage.Text) : 0;

    // search the text in PDF document
    PdfTextSearchItem[] searchTextInstances = pdfTextExtract.SearchText(pdfFile, textToSearch,
        fromPdfPageNumber, toPdfPageNumber, checkBoxMatchCase.Checked, checkBoxMatchWholeWord.Checked);

    // load the PDF file to highlight the searched text
    PdfDocument pdfDocument = PdfDocument.FromFile(pdfFile);

    // everything that touches the open document runs inside try/finally so the
    // document is closed even when highlighting fails, not only when writing fails
    try
    {
        // set a demo serial number
        pdfDocument.SerialNumber = "YCgJMTAE-BiwJAhIB-EhlWTlBA-UEBRQFBA-U1FOUVJO-WVlZWQ==";

        // highlight the searched text in PDF document
        foreach (PdfTextSearchItem searchTextInstance in searchTextInstances)
        {
            PdfRectangle pdfRectangle = new PdfRectangle(searchTextInstance.BoundingRectangle);

            // set rectangle color and opacity
            pdfRectangle.BackColor = Color.Yellow;
            pdfRectangle.Opacity = 30;

            // highlight the text; PdfPageNumber is used as a 1-based number against the 0-based Pages collection
            pdfDocument.Pages[searchTextInstance.PdfPageNumber - 1].Layout(pdfRectangle);
        }

        // write the PDF document to a memory buffer
        byte[] pdfBuffer = pdfDocument.WriteToMemory();

        // inform the browser about the binary data format
        HttpContext.Current.Response.AddHeader("Content-Type", "application/pdf");

        // let the browser know how to open the PDF document and the file name
        HttpContext.Current.Response.AddHeader("Content-Disposition",
            String.Format("attachment; filename=SearchText.pdf; size={0}", pdfBuffer.Length.ToString()));

        // write the PDF buffer to HTTP response
        HttpContext.Current.Response.BinaryWrite(pdfBuffer);

        // call End() method of HTTP response to stop ASP.NET page processing
        HttpContext.Current.Response.End();
    }
    finally
    {
        pdfDocument.Close();
    }
}
/// <summary>
/// Initializes the window, then extracts the text of the January 2009 mark-scheme PDF
/// and writes it to a .txt file of the same base name (both paths are relative).
/// </summary>
public MainWindow()
{
    InitializeComponent();

    // Note: the earlier experiments (Updator.Download and the per-PDF
    // PdfTextExtract.dumpQuestions CSV export) are disabled; only the single
    // mark-scheme text dump below runs.
    string markSchemeText = PdfTextExtract.pdfText("January 2009 MS - Unit 1 Edexcel Physics A-level.pdf");
    string outputFile = "January 2009 MS - Unit 1 Edexcel Physics A-level" + ".txt";

    File.WriteAllText(outputFile, markSchemeText);
}
/// <summary>
/// Builds CSV text for the question paper at <paramref name="path"/>: a header row
/// followed by one line per item returned by <c>getData</c> for each processed page.
/// </summary>
/// <remarks>
/// Starting at page 3, pages are scanned for the first one containing both "section"
/// and "answer all" (case-insensitive). Processing starts on that page and runs to the
/// end of the document, skipping pages containing "formulae" or "BLANK PAGE".
/// </remarks>
/// <param name="path">
/// Path of the question-paper PDF. The mark scheme is looked up under the same path
/// with "QP" replaced by "MS".
/// </param>
/// <returns>
/// The CSV text, or an empty string when no start page is found before page 20 or
/// before the end of the document.
/// </returns>
public static string dumpQuestions(string path)
{
    // Process the matching mark scheme first.
    // NOTE(review): presumably this fills the data that getData reads - confirm.
    PdfTextExtract.dumpMarkScheme(path.Replace("QP", "MS"));

    PdfReader reader = new PdfReader(path);

    // try/finally so the reader is closed on the early return and on any exception.
    try
    {
        StringBuilder text = new StringBuilder();
        string t = "";
        int questionNo = 0;
        int tempNo = 0;
        int pages = 3;

        Debug.Print(path.Length > 4 ? path.Substring(0, 4) : path);
        text.AppendLine("questionNo,questionAqua,questionSmall,question,markScheme,score,Topics");

        // Find the page on which the questions start.
        while (!isSectionStartPage(t))
        {
            // Give up after page 19, or at the end of a shorter document
            // (reading past the last page would throw).
            if (pages >= 20 || pages > reader.NumberOfPages)
            {
                return "";
            }

            t = iTextSharp.text.pdf.parser.PdfTextExtractor.GetTextFromPage(reader, pages);
            pages++;
        }

        // pages was incremented after the matching read, so pages - 1 is the start page.
        for (int page = pages - 1; page <= reader.NumberOfPages; page++)
        {
            t = iTextSharp.text.pdf.parser.PdfTextExtractor.GetTextFromPage(reader, page);
            if (t.Contains("formulae") || t.Contains("BLANK PAGE"))
            {
                continue;
            }

            // NOTE(review): in this verbatim pattern "\\n" matches a literal backslash
            // followed by 'n', not a line break - confirm that is intended.
            t = Regex.Replace(t, @"(\\n\*.*\*\\n)", @"");

            // Only the first 30 characters are inspected for the question number;
            // pages shorter than that are used whole instead of throwing.
            string temp = t.Length > 30 ? t.Substring(0, 30) : t;

            // A successful match is always two digits, so int.Parse cannot fail here.
            Match leading = Regex.Match(temp, @"(1[0-9]|2[0-9]|24)");
            if (leading.Success)
            {
                questionNo = int.Parse(leading.Groups[1].Value);
            }
            else if (!isSectionStartPage(t))
            {
                // NOTE(review): this condition is true unless temp contains all five
                // markers; '&&' may have been intended - confirm before changing.
                if (!temp.Contains("(i)") || !temp.Contains("ii") || !temp.Contains("(b)") || !temp.Contains("(c)") || !temp.Contains("(d)"))
                {
                    // NOTE(review): this pattern starts with the same alternation that
                    // just failed on temp, so it looks unable to match here and the
                    // previous question number is reused - confirm.
                    Match retry = Regex.Match(temp, @"((1[0-9]|2[0-9]|24))(?s).*[\\n\*.*\*\\n]");
                    if (retry.Success)
                    {
                        questionNo = int.Parse(retry.Groups[1].Value);
                        if (questionNo - tempNo >= 2 && tempNo != 0)
                        {
                            questionNo = ++tempNo;
                        }
                    }
                    else
                    {
                        questionNo = tempNo;
                    }
                }
                else
                {
                    questionNo = tempNo;
                }
            }
            else
            {
                // Start page: search the whole page text. As before, a page without a
                // matching number raises FormatException to the caller.
                Match inPage = Regex.Match(t, @"((1[0-9]|2[0-9]|24))(?s).*[\\n\*.*\*\\n]");
                questionNo = int.Parse(inPage.Groups[1].Value);
            }

            tempNo = questionNo;

            foreach (var q in getData(questionNo, t))
            {
                text.AppendLine(q.ToString());
            }
        }

        return text.ToString();
    }
    finally
    {
        reader.Close();
    }
}

// True when the page text contains both "section" and "answer all", ignoring case.
private static bool isSectionStartPage(string pageText)
{
    return pageText.IndexOf("section", StringComparison.CurrentCultureIgnoreCase) >= 0
        && pageText.IndexOf("answer all", StringComparison.CurrentCultureIgnoreCase) >= 0;
}