Esempio n. 1
1
        /// <summary>
        /// Reads a PDF file and extracts all text-searchable content from it.
        /// </summary>
        /// <remarks>
        /// Extraction is best-effort: if opening or parsing the document fails part-way
        /// through, the failure is reported through <see cref="System.Diagnostics.Trace"/>
        /// and whatever text was gathered from the pages processed so far is returned
        /// (possibly an empty string). The exception is not propagated to the caller.
        /// </remarks>
        /// <param name="file">The file to extract text from. It must exist and have a .pdf extension.</param>
        /// <returns>
        /// The result of ExtractTextFromPDFBytes for each page, in page order,
        /// each followed by a single space.
        /// </returns>
        /// <exception cref="ArgumentNullException"><paramref name="file"/> is null or empty.</exception>
        /// <exception cref="FileNotFoundException"><paramref name="file"/> does not exist.</exception>
        /// <exception cref="ArgumentException"><paramref name="file"/> does not have a .pdf extension.</exception>
        public static string ReadFile(string file)
        {
            if (string.IsNullOrEmpty(file)) throw new ArgumentNullException("file", "file cannot be null or empty");
            FileInfo info = new FileInfo(file);
            // Supplying the path populates FileNotFoundException.FileName; Message is unchanged.
            if (!info.Exists) throw new FileNotFoundException("file must exist", info.FullName);
            if (!info.Extension.Equals(".pdf", StringComparison.OrdinalIgnoreCase)) throw new ArgumentException("File must have a .pdf extension");

            System.Text.StringBuilder builder = new System.Text.StringBuilder();
            iTextSharp.text.pdf.PdfReader reader = null;
            try {
                reader = new iTextSharp.text.pdf.PdfReader(info.FullName);
                // Page numbers passed to GetPageContent start at 1.
                int pageCount = reader.NumberOfPages;
                for (int i = 1; i <= pageCount; i++) {
                    builder.Append(ExtractTextFromPDFBytes(reader.GetPageContent(i)));
                    builder.Append(' ');
                }
            }
            catch (Exception ex) {
                // Keep the best-effort contract (return what we have) but do not hide the
                // failure: without this a corrupt PDF is indistinguishable from an empty one.
                System.Diagnostics.Trace.TraceWarning("ReadFile: text extraction from '{0}' failed: {1}", info.FullName, ex);
            }
            finally {
                if (reader != null) reader.Close();
            }

            return builder.ToString();
        }
Esempio n. 2
0
        /// <summary>
        /// Reads a PDF file and extracts all text-searchable content from it.
        /// </summary>
        /// <remarks>
        /// Extraction is best-effort: when opening or parsing the document fails, the
        /// exception is written to <see cref="System.Diagnostics.Trace"/> instead of being
        /// propagated, and the text collected from the pages handled so far is returned
        /// (which may be an empty string).
        /// </remarks>
        /// <param name="file">The file to extract text from. It must exist and have a .pdf extension.</param>
        /// <returns>
        /// The result of ExtractTextFromPDFBytes for each page, in page order,
        /// each followed by a single space.
        /// </returns>
        /// <exception cref="ArgumentNullException"><paramref name="file"/> is null or empty.</exception>
        /// <exception cref="FileNotFoundException"><paramref name="file"/> does not exist.</exception>
        /// <exception cref="ArgumentException"><paramref name="file"/> does not have a .pdf extension.</exception>
        public static string ReadFile(string file)
        {
            if (string.IsNullOrEmpty(file))
            {
                throw new ArgumentNullException("file", "file cannot be null or empty");
            }
            FileInfo info = new FileInfo(file);

            if (!info.Exists)
            {
                // The second argument fills FileNotFoundException.FileName; Message stays the same.
                throw new FileNotFoundException("file must exist", info.FullName);
            }
            if (!info.Extension.Equals(".pdf", StringComparison.OrdinalIgnoreCase))
            {
                throw new ArgumentException("File must have a .pdf extension");
            }

            System.Text.StringBuilder     builder = new System.Text.StringBuilder();
            iTextSharp.text.pdf.PdfReader reader  = null;
            try
            {
                reader = new iTextSharp.text.pdf.PdfReader(info.FullName);

                // Page numbers passed to GetPageContent start at 1.
                int pageCount = reader.NumberOfPages;
                for (int i = 1; i <= pageCount; i++)
                {
                    builder.Append(ExtractTextFromPDFBytes(reader.GetPageContent(i)));
                    builder.Append(' ');
                }
            }
            catch (Exception ex)
            {
                // Still return the partial text, but leave a trace of the failure so a
                // damaged PDF can be told apart from one that simply contains no text.
                System.Diagnostics.Trace.TraceWarning("ReadFile: text extraction from '{0}' failed: {1}", info.FullName, ex);
            }
            finally
            {
                if (reader != null)
                {
                    reader.Close();
                }
            }

            return(builder.ToString());
        }
        /// <summary>
        /// Uses the GetResponseCore and GetResponseCoreFinalize to 'inherit' the IFilter behaviour
        /// but also extend it with iTextSharp
        /// </summary>
        /// <remarks>
        /// Add iTextSharp to get better 'title'
        /// [v7] fix by [email protected]
        /// The downloaded file is opened with iTextSharp to (1) replace <c>Title</c> with the
        /// PDF's embedded "Title" entry when that entry is non-blank, and (2) fill <c>All</c>
        /// from the page content streams when it is still null or empty after GetResponseCore.
        /// The reader is closed even if title lookup or tokenising throws.
        /// </remarks>
        /// <param name="webresponse">The HTTP response that is handed to GetResponseCore.</param>
        /// <returns>
        /// <c>true</c> when <c>All</c> is non-empty after processing (in which case
        /// <c>Description</c> is also set); otherwise <c>false</c>.
        /// </returns>
        public override bool GetResponse(System.Net.HttpWebResponse webresponse)
        {
            string filename = System.IO.Path.Combine(Preferences.DownloadedTempFilePath, (System.IO.Path.GetFileName(this.Uri.LocalPath)));

            base.GetResponseCore(webresponse, filename);

            // [v7] fix by [email protected]
            iTextSharp.text.pdf.PdfReader pdfReader = new iTextSharp.text.pdf.PdfReader(filename);
            try
            {
                // Look the entry up once and reuse the value for both the null test and the conversion.
                object embeddedTitle = pdfReader.Info["Title"];
                if (null != embeddedTitle)
                {   // overwrite the 'filename' with the embedded title
                    string pdfTitle = Convert.ToString(embeddedTitle).Trim();
                    if (!String.IsNullOrEmpty(pdfTitle))
                    {
                        this.Title = pdfTitle;
                    }
                }

                // Now, since we've loaded the iTextSharp library, and the EPocalipse IFilter sometimes
                // fails (old Acrobat, installation problem, etc); let's try 'indexing' the PDF with iTextSharp
                // [v7]
                if (String.IsNullOrEmpty(this.All))
                {
                    this.All = String.Empty;
                    System.Text.StringBuilder sb = new System.Text.StringBuilder();
                    // Following code from:
                    // http://www.vbforums.com/showthread.php?t=475759
                    for (int p = 1; p <= pdfReader.NumberOfPages; p++)
                    {
                        byte[] pageBytes = pdfReader.GetPageContent(p);

                        if (null != pageBytes)
                        {
                            iTextSharp.text.pdf.PRTokeniser token = new iTextSharp.text.pdf.PRTokeniser(new iTextSharp.text.pdf.RandomAccessFileOrArray(pageBytes));
                            while (token.NextToken())
                            {
                                iTextSharp.text.pdf.PRTokeniser.TokType tknType = token.TokenType;
                                string tknValue = token.StringValue;

                                if (tknType == iTextSharp.text.pdf.PRTokeniser.TokType.STRING)
                                {
                                    // String tokens are appended verbatim as the page text.
                                    sb.Append(tknValue);
                                }
                                else if (tknType == iTextSharp.text.pdf.PRTokeniser.TokType.NUMBER && tknValue == "-600")
                                {
                                    // Heuristic from the referenced thread: the number -600 is treated as a
                                    // word gap (presumably a spacing adjustment inside a text array - unverified).
                                    sb.Append(" ");
                                }
                                else if (tknType == iTextSharp.text.pdf.PRTokeniser.TokType.OTHER && tknValue == "TJ")
                                {
                                    // A "TJ" token also gets a separator (presumably the PDF show-text operator - unverified).
                                    sb.Append(" ");
                                }
                            }
                        }
                    }
                    // NUL characters in the extracted strings are replaced by spaces.
                    this.All += sb.ToString().Replace('\0', ' ');
                }
            }
            finally
            {
                // Previously skipped when anything above threw, leaking the reader.
                pdfReader.Close();
            }

            base.GetResponseCoreFinalize(filename);

            if (!String.IsNullOrEmpty(this.All))
            {
                this.Description = base.GetDescriptionFromWordsOnly(WordsOnly);
                return(true);
            }
            else
            {
                return(false);
            }
        }
Esempio n. 4
0
        /// <summary>
        /// Uses the GetResponseCore and GetResponseCoreFinalize to 'inherit' the IFilter behaviour
        /// but also extend it with iTextSharp
        /// </summary>
        /// <remarks>
        /// Add iTextSharp to get better 'title'
        /// [v7] fix by [email protected]
        /// After GetResponseCore has run, the downloaded file is opened with iTextSharp. A
        /// non-blank embedded "Title" entry replaces <c>Title</c>, and if <c>All</c> is still
        /// null or empty it is filled from the tokens of each page's content stream. The
        /// reader is closed in a finally block so it is released even when an exception occurs.
        /// </remarks>
        /// <param name="webresponse">The HTTP response that is handed to GetResponseCore.</param>
        /// <returns>
        /// <c>true</c> when <c>All</c> is non-empty after processing (in which case
        /// <c>Description</c> is also set); otherwise <c>false</c>.
        /// </returns>
        public override bool GetResponse(System.Net.HttpWebResponse webresponse)
        {
            string filename = System.IO.Path.Combine(Preferences.DownloadedTempFilePath, (System.IO.Path.GetFileName(this.Uri.LocalPath)));

            base.GetResponseCore(webresponse, filename);

            // [v7] fix by [email protected]
            iTextSharp.text.pdf.PdfReader pdfReader = new iTextSharp.text.pdf.PdfReader(filename);
            try
            {
                // Fetch the entry a single time; it serves both the null check and the conversion.
                object embeddedTitle = pdfReader.Info["Title"];
                if (null != embeddedTitle)
                {   // overwrite the 'filename' with the embedded title
                    string pdfTitle = Convert.ToString(embeddedTitle).Trim();
                    if (!String.IsNullOrEmpty(pdfTitle))
                    {
                        this.Title = pdfTitle;
                    }
                }

                // Now, since we've loaded the iTextSharp library, and the EPocalipse IFilter sometimes
                // fails (old Acrobat, installation problem, etc); let's try 'indexing' the PDF with iTextSharp
                // [v7]
                if (String.IsNullOrEmpty(this.All))
                {
                    this.All = String.Empty;
                    System.Text.StringBuilder sb = new System.Text.StringBuilder();
                    // Following code from:
                    // http://www.vbforums.com/showthread.php?t=475759
                    for (int p = 1; p <= pdfReader.NumberOfPages; p++)
                    {
                        byte[] pageBytes = pdfReader.GetPageContent(p);

                        if (null != pageBytes)
                        {
                            iTextSharp.text.pdf.PRTokeniser token = new iTextSharp.text.pdf.PRTokeniser(new iTextSharp.text.pdf.RandomAccessFileOrArray(pageBytes));
                            while (token.NextToken())
                            {
                                iTextSharp.text.pdf.PRTokeniser.TokType tknType = token.TokenType;
                                string tknValue = token.StringValue;

                                if (tknType == iTextSharp.text.pdf.PRTokeniser.TokType.STRING)
                                {
                                    // String tokens carry the page text and are appended as-is.
                                    sb.Append(tknValue);
                                }
                                else if (tknType == iTextSharp.text.pdf.PRTokeniser.TokType.NUMBER && tknValue == "-600")
                                {
                                    // Heuristic taken from the referenced thread: -600 is treated as a word
                                    // break (presumably a spacing adjustment in a text array - unverified).
                                    sb.Append(" ");
                                }
                                else if (tknType == iTextSharp.text.pdf.PRTokeniser.TokType.OTHER && tknValue == "TJ")
                                {
                                    // A "TJ" token also gets a separator (presumably the PDF show-text operator - unverified).
                                    sb.Append(" ");
                                }
                            }
                        }
                    }
                    // NUL characters in the extracted strings are replaced by spaces.
                    this.All += sb.ToString().Replace('\0', ' ');
                }
            }
            finally
            {
                // Without the finally, any exception above left the reader open.
                pdfReader.Close();
            }

            base.GetResponseCoreFinalize(filename);

            if (!String.IsNullOrEmpty(this.All))
            {
                this.Description = base.GetDescriptionFromWordsOnly(WordsOnly);
                return true;
            }
            else
            {
                return false;
            }
        }