Esempio n. 1
0
        /// <summary>
        /// Searches the demo PDF for the submitted text, highlights each match with a
        /// semi-transparent yellow rectangle and returns the modified document as a download.
        /// </summary>
        /// <param name="collection">
        /// Posted form values. Reads <c>textBoxTextToSearch</c>, <c>textBoxFromPage</c>,
        /// <c>textBoxToPage</c> (optional; 0 is passed to the search when it is missing or empty),
        /// <c>checkBoxMatchCase</c> and <c>checkBoxMatchWholeWord</c> (treated as checked when present).
        /// </param>
        /// <returns>A PDF file result named <c>SearchText.pdf</c>.</returns>
        public ActionResult SearchText(IFormCollection collection)
        {
            // get the PDF file
            string pdfFile = m_hostingEnvironment.ContentRootPath + @"\wwwroot" + @"\DemoFiles\Pdf\InputPdf.pdf";

            // get the text to search
            string textToSearch = collection["textBoxTextToSearch"];

            // create the PDF text extractor
            PdfTextExtract pdfTextExtract = new PdfTextExtract();

            // set a demo serial number
            pdfTextExtract.SerialNumber = "YCgJMTAE-BiwJAhIB-EhlWTlBA-UEBRQFBA-U1FOUVJO-WVlZWQ==";

            int fromPdfPageNumber = int.Parse(collection["textBoxFromPage"]);

            // Check Count before indexing: when the field is not posted the value set is empty
            // and reading element [0] would throw instead of falling back to 0.
            var toPageValues = collection["textBoxToPage"];
            int toPdfPageNumber = toPageValues.Count > 0 && !string.IsNullOrEmpty(toPageValues[0])
                ? int.Parse(toPageValues)
                : 0;

            // a checkbox counts as checked when the form posted a value for it
            bool matchCase      = collection["checkBoxMatchCase"].Count > 0;
            bool matchWholeWord = collection["checkBoxMatchWholeWord"].Count > 0;

            // search the text in PDF document
            PdfTextSearchItem[] searchTextInstances = pdfTextExtract.SearchText(pdfFile, textToSearch,
                                                                                fromPdfPageNumber, toPdfPageNumber, matchCase, matchWholeWord);

            // load the PDF file to highlight the searched text
            PdfDocument pdfDocument = PdfDocument.FromFile(pdfFile);

            // Everything after the document is opened runs inside try/finally so the document
            // is closed even when highlighting (not just writing) fails.
            try
            {
                // set a demo serial number
                pdfDocument.SerialNumber = "YCgJMTAE-BiwJAhIB-EhlWTlBA-UEBRQFBA-U1FOUVJO-WVlZWQ==";

                // highlight the searched text in PDF document
                foreach (PdfTextSearchItem searchTextInstance in searchTextInstances)
                {
                    PdfRectangle pdfRectangle = new PdfRectangle(searchTextInstance.BoundingRectangle);

                    // set rectangle color and opacity
                    pdfRectangle.BackColor = Color.Yellow;
                    pdfRectangle.Opacity   = 30;

                    // PdfPageNumber is used with a -1 offset to index the Pages collection
                    pdfDocument.Pages[searchTextInstance.PdfPageNumber - 1].Layout(pdfRectangle);
                }

                // write the PDF document to a memory buffer
                byte[] pdfBuffer = pdfDocument.WriteToMemory();

                FileResult fileResult = new FileContentResult(pdfBuffer, "application/pdf");
                fileResult.FileDownloadName = "SearchText.pdf";

                return fileResult;
            }
            finally
            {
                pdfDocument.Close();
            }
        }
Esempio n. 2
0
        /// <summary>
        /// Click handler that extracts text from a page range of the demo PDF and sends it to
        /// the browser as a UTF-8 text attachment named <c>ExtractedText.txt</c>.
        /// </summary>
        /// <param name="sender">Event source (not used).</param>
        /// <param name="e">Event data (not used).</param>
        protected void buttonExtractText_Click(object sender, EventArgs e)
        {
            // location of the input PDF under the application root
            string inputPdfPath = Server.MapPath("~") + @"\DemoFiles\Pdf\InputPdf.pdf";

            // configure the extractor: demo serial number first, then the extraction mode
            PdfTextExtract extractor = new PdfTextExtract
            {
                SerialNumber    = "YCgJMTAE-BiwJAhIB-EhlWTlBA-UEBRQFBA-U1FOUVJO-WVlZWQ==",
                TextExtractMode = GetTextExtractMode()
            };

            // page range; an empty "to page" box is passed on as 0
            int firstPage = int.Parse(textBoxFromPage.Text);
            int lastPage  = 0;
            if (textBoxToPage.Text.Length > 0)
            {
                lastPage = int.Parse(textBoxToPage.Text);
            }

            // extract the text from the requested pages
            string extractedText = extractor.ExtractText(inputPdfPath, firstPage, lastPage);

            // build the download: UTF-8 marker bytes followed by the UTF-8 encoded text
            byte[] encodedText = Encoding.UTF8.GetBytes(extractedText);
            byte[] marker      = new byte[] { 0xEF, 0xBB, 0xBF };
            byte[] payload     = new byte[encodedText.Length + marker.Length];
            marker.CopyTo(payload, 0);
            encodedText.CopyTo(payload, marker.Length);

            var response = HttpContext.Current.Response;

            // tell the browser the data format
            response.AddHeader("Content-Type", "text/plain; charset=UTF-8");

            // tell the browser to download the data and which file name to use
            string disposition = String.Format("{0}; filename=ExtractedText.txt; size={1}", "attachment", payload.Length.ToString());
            response.AddHeader("Content-Disposition", disposition);

            // send the bytes and stop further ASP.NET page processing
            response.BinaryWrite(payload);
            response.End();
        }
Esempio n. 3
0
        /// <summary>
        /// Extracts text from a page range of the demo PDF and returns it as a UTF-8 text
        /// download named <c>ExtractedText.txt</c>.
        /// </summary>
        /// <param name="collection">
        /// Posted form values. Reads <c>textBoxFromPage</c> and <c>textBoxToPage</c> (optional;
        /// 0 is passed to the extractor when it is missing or empty). The collection is also
        /// stored in <c>m_formCollection</c> before <c>GetTextExtractMode()</c> is called.
        /// </param>
        /// <returns>A text file result containing the UTF-8 marker followed by the extracted text.</returns>
        public ActionResult ExtractText(IFormCollection collection)
        {
            m_formCollection = collection;

            // get the PDF file
            string pdfFile = m_hostingEnvironment.ContentRootPath + @"\wwwroot" + @"\DemoFiles\Pdf\InputPdf.pdf";

            // create the PDF text extractor
            PdfTextExtract pdfTextExtract = new PdfTextExtract();

            // set a demo serial number
            pdfTextExtract.SerialNumber = "YCgJMTAE-BiwJAhIB-EhlWTlBA-UEBRQFBA-U1FOUVJO-WVlZWQ==";

            // set the text extraction mode
            pdfTextExtract.TextExtractMode = GetTextExtractMode();

            int fromPdfPageNumber = int.Parse(collection["textBoxFromPage"]);

            // Check Count before indexing: when the field is not posted the value set is empty
            // and reading element [0] would throw instead of falling back to 0.
            var toPageValues = collection["textBoxToPage"];
            int toPdfPageNumber = toPageValues.Count > 0 && !string.IsNullOrEmpty(toPageValues[0])
                ? int.Parse(toPageValues)
                : 0;

            // extract the text from a range of pages of the PDF document
            string text = pdfTextExtract.ExtractText(pdfFile, fromPdfPageNumber, toPdfPageNumber);

            // get UTF-8 bytes
            byte[] utf8Bytes = Encoding.UTF8.GetBytes(text);

            // the UTF-8 marker
            byte[] utf8Marker = new byte[] { 0xEF, 0xBB, 0xBF };

            // the text document bytes with UTF-8 marker followed by UTF-8 bytes
            byte[] bytes = new byte[utf8Bytes.Length + utf8Marker.Length];
            Array.Copy(utf8Marker, 0, bytes, 0, utf8Marker.Length);
            Array.Copy(utf8Bytes, 0, bytes, utf8Marker.Length, utf8Bytes.Length);

            FileResult fileResult = new FileContentResult(bytes, "text/plain; charset=UTF-8");

            fileResult.FileDownloadName = "ExtractedText.txt";

            return fileResult;
        }
        /// <summary>
        /// Click handler that searches the demo PDF for the entered text, highlights each match
        /// with a semi-transparent yellow rectangle and sends the modified PDF to the browser
        /// as an attachment named <c>SearchText.pdf</c>.
        /// </summary>
        /// <param name="sender">Event source (not used).</param>
        /// <param name="e">Event data (not used).</param>
        protected void buttonSearchText_Click(object sender, EventArgs e)
        {
            // get the PDF file
            string pdfFile = Server.MapPath("~") + @"\DemoFiles\Pdf\InputPdf.pdf";

            // get the text to search
            string textToSearch = textBoxTextToSearch.Text;

            // create the PDF text extractor
            PdfTextExtract pdfTextExtract = new PdfTextExtract();

            // set a demo serial number
            pdfTextExtract.SerialNumber = "YCgJMTAE-BiwJAhIB-EhlWTlBA-UEBRQFBA-U1FOUVJO-WVlZWQ==";

            // page range; an empty "to page" box is passed on as 0
            int fromPdfPageNumber = int.Parse(textBoxFromPage.Text);
            int toPdfPageNumber   = textBoxToPage.Text.Length > 0 ? int.Parse(textBoxToPage.Text) : 0;

            // search the text in PDF document
            PdfTextSearchItem[] searchTextInstances = pdfTextExtract.SearchText(pdfFile, textToSearch,
                                                                                fromPdfPageNumber, toPdfPageNumber, checkBoxMatchCase.Checked, checkBoxMatchWholeWord.Checked);

            // load the PDF file to highlight the searched text
            PdfDocument pdfDocument = PdfDocument.FromFile(pdfFile);

            // Everything after the document is opened runs inside try/finally so the document
            // is closed even when highlighting (not just writing) fails.
            try
            {
                // set a demo serial number
                pdfDocument.SerialNumber = "YCgJMTAE-BiwJAhIB-EhlWTlBA-UEBRQFBA-U1FOUVJO-WVlZWQ==";

                // highlight the searched text in PDF document
                foreach (PdfTextSearchItem searchTextInstance in searchTextInstances)
                {
                    PdfRectangle pdfRectangle = new PdfRectangle(searchTextInstance.BoundingRectangle);

                    // set rectangle color and opacity
                    pdfRectangle.BackColor = Color.Yellow;
                    pdfRectangle.Opacity   = 30;

                    // PdfPageNumber is used with a -1 offset to index the Pages collection
                    pdfDocument.Pages[searchTextInstance.PdfPageNumber - 1].Layout(pdfRectangle);
                }

                // write the PDF document to a memory buffer
                byte[] pdfBuffer = pdfDocument.WriteToMemory();

                // inform the browser about the binary data format
                HttpContext.Current.Response.AddHeader("Content-Type", "application/pdf");

                // let the browser know how to open the PDF document and the file name
                HttpContext.Current.Response.AddHeader("Content-Disposition", String.Format("attachment; filename=SearchText.pdf; size={0}",
                                                                                            pdfBuffer.Length.ToString()));

                // write the PDF buffer to HTTP response
                HttpContext.Current.Response.BinaryWrite(pdfBuffer);

                // call End() method of HTTP response to stop ASP.NET page processing;
                // the finally block below still closes the document afterwards
                HttpContext.Current.Response.End();
            }
            finally
            {
                pdfDocument.Close();
            }
        }
        /// <summary>
        /// Initializes the window, then writes the text returned by
        /// <c>PdfTextExtract.pdfText</c> for one hard-coded mark-scheme PDF to a .txt file.
        /// Both file names are relative paths.
        /// </summary>
        public MainWindow()
        {
            InitializeComponent();

            // Disabled experiments kept for reference:
            //   Updator.Download();
            //   string path = ("2012JUN_Q.pdf");
            //
            //   foreach (var path in Directory.EnumerateFiles(AppContext.BaseDirectory, "*.pdf"))
            //   {
            //       if (path.Contains("MS")) continue;
            //       var t = PdfTextExtract.dumpQuestions(path);
            //       Dispatcher.BeginInvoke(new Action(() =>
            //       {
            //           tb.Text = t;
            //       }));
            //
            //       File.WriteAllText(path + ".csv", t);
            //   }

            // dump the mark scheme's text next to the PDF
            string outputFile     = "January 2009 MS - Unit 1 Edexcel Physics A-level" + ".txt";
            string markSchemeText = PdfTextExtract.pdfText("January 2009 MS - Unit 1 Edexcel Physics A-level.pdf");
            File.WriteAllText(outputFile, markSchemeText);

            // Disabled question-paper dump kept for reference:
            //   string path = ("January 2009 QP - Unit 1 Edexcel Physics A-level.pdf");
            //   var t = PdfTextExtract.dumpQuestions(path);
            //
            //   tb.Text = t;
            //   File.WriteAllText(path + ".csv", t);
        }
    /// <summary>
    /// Builds CSV text for a question-paper PDF: a header row followed by one line per item
    /// returned by <c>getData</c> for each processed page.
    /// </summary>
    /// <param name="path">
    /// Path of the question-paper PDF. The matching mark scheme is processed first, using the
    /// same path with "QP" replaced by "MS".
    /// </param>
    /// <returns>
    /// The CSV text, or an empty string when no page containing both "section" and
    /// "answer all" is found before page 20 or before the end of the document.
    /// </returns>
    public static string dumpQuestions(string path)
    {
        PdfTextExtract.dumpMarkScheme(path.Replace("QP", "MS"));

        // NOTE(review): 'list' is not used below; kept in case reading dataMS matters - confirm and remove.
        var       list   = dataMS;
        PdfReader reader = new PdfReader(path);

        // try/finally guarantees the reader is closed on the early return and on exceptions,
        // not only on the normal path.
        try
        {
            StringBuilder text       = new StringBuilder();
            string        t          = "";
            int           questionNo = 0;
            int           tempNo     = 0;
            int           pages      = 3;

            // print the first four characters of the path; guarded so short paths do not throw
            Debug.Print(path.Length >= 4 ? path.Substring(0, 4) : path);
            text.AppendLine("questionNo,questionAqua,questionSmall,question,markScheme,score,Topics");

            // scan forward from page 3 until a page mentions both "section" and "answer all"
            while (t.IndexOf("section", StringComparison.CurrentCultureIgnoreCase) < 0 || t.IndexOf("answer all", StringComparison.CurrentCultureIgnoreCase) < 0)
            {
                // give up at page 20, and also at the end of the document so that a page
                // past the last one is never requested
                if (pages >= 20 || pages > reader.NumberOfPages)
                {
                    return "";
                }
                t = iTextSharp.text.pdf.parser.PdfTextExtractor.GetTextFromPage(reader, pages);
                pages++;
            }

            // 'pages' was incremented after the matching page was read, so start at pages - 1
            for (int page = pages - 1; page <= reader.NumberOfPages; page++)
            {
                t = iTextSharp.text.pdf.parser.PdfTextExtractor.GetTextFromPage(reader, page);
                if (t.Contains("formulae") || t.Contains("BLANK PAGE"))
                {
                    continue;
                }
                t = Regex.Replace(t, @"(\\n\*.*\*\\n)", @"");

                // only the first 30 characters are inspected for a question number;
                // clamp the length so pages with less text do not throw
                string temp = t.Substring(0, Math.Min(30, t.Length));
                try
                {
                    //www\.dynamicpapers\.com\\n(1[0-9]|2[0-9]|24)
                    //  var r = Regex.Match(temp, @"((1[0-9]|2[0-9]|24))(?s).*\)");
                    // the pattern matches a two-digit number from 10 to 29; when nothing matches,
                    // the group value is empty and int.Parse throws, which enters the fallback below
                    var r = Regex.Match(temp, @"(1[0-9]|2[0-9]|24)");
                    questionNo = int.Parse(r.Groups[1].Value.ToString());
                    //  System.Diagnostics.Debug.Print(questionNo.ToString());
                }
                catch
                {
                    if (t.IndexOf("section", StringComparison.CurrentCultureIgnoreCase) < 0 || t.IndexOf("answer all", StringComparison.CurrentCultureIgnoreCase) < 0)
                    {
                        // NOTE(review): this condition is false only when temp contains every one
                        // of the markers; '&&' may have been intended - confirm.
                        if (!temp.Contains("(i)") || !temp.Contains("ii") || !temp.Contains("(b)") || !temp.Contains("(c)") || !temp.Contains("(d)"))
                        {
                            try
                            {
                                var r = Regex.Match(temp, @"((1[0-9]|2[0-9]|24))(?s).*[\\n\*.*\*\\n]");
                                questionNo = int.Parse(r.Groups[1].Value.ToString());

                                // a jump of two or more from the previous page's number is
                                // replaced by the previous number plus one
                                if (questionNo - tempNo >= 2 && tempNo != 0)
                                {
                                    questionNo = ++tempNo;
                                }
                            }
                            catch
                            {
                                // no number found: reuse the previous page's question number
                                questionNo = tempNo;
                            }
                        }
                        else
                        {
                            questionNo = tempNo;
                        }
                    }
                    else
                    {
                        // section-header page: search the whole page text instead of the prefix;
                        // a failed parse here is not caught and propagates to the caller
                        var r = Regex.Match(t, @"((1[0-9]|2[0-9]|24))(?s).*[\\n\*.*\*\\n]");
                        questionNo = int.Parse(r.Groups[1].Value.ToString());
                    }
                }

                //  if (tempNo > questionNo && questionNo-tempNo <= 2)
                //  {
                //      questionNo = tempNo;
                //    }

                tempNo = questionNo;
                foreach (var q in getData(questionNo, t))
                {
                    text.AppendLine(q.ToString());
                }
            }

            return text.ToString();
        }
        finally
        {
            reader.Close();
        }
    }