GetPageContent() public method

public GetPageContent ( int pageNum ) : byte[]
pageNum int
return byte[]
示例#1
1
 /// <summary>
 /// Removes the named optional-content (OCG) layers from a PDF document.
 /// </summary>
 /// <remarks>
 /// Every page's content is first read and written back through the reader. Each page is then
 /// handed to <c>Parse</c> together with an <c>OCGParser</c> built from the layer names, loses its
 /// <c>PieceInfo</c> entry, and is passed to <c>RemoveAnnots</c> and <c>RemoveProperties</c>.
 /// Finally <c>RemoveOCGsFromArray</c> is applied to the catalog's <c>OCProperties</c> dictionary
 /// and to its <c>D</c> sub-dictionary (when they exist), and unused objects are removed from the reader.
 /// </remarks>
 /// <param name="reader">a PdfReader containing a PDF document; it is modified in place</param>
 /// <param name="layers">a sequence of names of OCG layers</param>
 /// <exception cref="IOException">carried over from the original documentation; presumably raised by the underlying PDF processing - TODO confirm</exception>
 public virtual void RemoveLayers(PdfReader reader, params string[] layers)
 {
     int pageCount = reader.NumberOfPages;

     // Round-trip each page's content through the reader before parsing.
     for (int pageNo = 1; pageNo <= pageCount; pageNo++)
     {
         byte[] content = reader.GetPageContent(pageNo);
         reader.SetPageContent(pageNo, content);
     }

     ICollection<string> layerNames = new HashSet2<string>();
     foreach (string layer in layers)
     {
         layerNames.Add(layer);
     }

     OCGParser parser = new OCGParser(layerNames);
     for (int pageNo = 1; pageNo <= pageCount; pageNo++)
     {
         PdfDictionary pageDict = reader.GetPageN(pageNo);
         Parse(parser, pageDict);
         pageDict.Remove(new PdfName("PieceInfo"));
         RemoveAnnots(pageDict, layerNames);
         RemoveProperties(pageDict, layerNames);
     }

     PdfDictionary catalog = reader.Catalog;
     PdfDictionary ocProperties = catalog.GetAsDict(PdfName.OCPROPERTIES);
     if (ocProperties != null)
     {
         RemoveOCGsFromArray(ocProperties, PdfName.OCGS, layerNames);

         PdfDictionary defaultConfig = ocProperties.GetAsDict(PdfName.D);
         if (defaultConfig != null)
         {
             // Same clean-up for every array entry of the D dictionary, in the original order.
             PdfName[] arrayKeys =
             {
                 PdfName.ON, PdfName.OFF, PdfName.LOCKED,
                 PdfName.RBGROUPS, PdfName.ORDER, PdfName.AS
             };
             foreach (PdfName arrayKey in arrayKeys)
             {
                 RemoveOCGsFromArray(defaultConfig, arrayKey, layerNames);
             }
         }
     }

     reader.RemoveUnusedObjects();
 }
        /// <summary>
        /// Reports whether the given text occurs in the extracted text of any page of a PDF file.
        /// </summary>
        /// <param name="inFileName">Path of the PDF file; passed to the <c>PdfReader</c> constructor.</param>
        /// <param name="textToFind">Text to look for. It is matched ordinally (exact, case-sensitive).</param>
        /// <returns>
        /// <c>true</c> as soon as one page's extracted text contains <paramref name="textToFind"/>;
        /// <c>false</c> when no page contains it, or when any exception is raised while reading
        /// (including a <c>null</c> <paramref name="textToFind"/>).
        /// </returns>
        public bool IsTextInPdf(string inFileName, string textToFind)
        {
            try
            {
                // Create a reader for the given PDF file
                using (PdfReader reader = new PdfReader(inFileName))
                {
                    for (int page = 1; page <= reader.NumberOfPages; page++)
                    {
                        string pageText = ExtractTextFromPDFBytes(reader.GetPageContent(page));

                        // string.Contains(string) is an ordinal search. The previous IndexOf(string)
                        // overload is culture-sensitive, so its result could vary with the current
                        // culture and runtime and miss or invent matches in raw extracted text.
                        if (pageText.Contains(textToFind))
                        {
                            return true;
                        }
                    }
                    return false;
                }
            }
            catch
            {
                // Best-effort probe: any failure (unreadable file, extraction error, null argument)
                // is reported as "text not found", exactly as before.
                return false;
            }
        }
        /// <summary>
        /// Compress a pdf
        /// </summary>
        /// <remarks>
        /// The PDF is decoded from base64, every page's content is read and written back through the
        /// reader, and the document is re-saved through a <c>PdfStamper</c> with full compression enabled.
        /// NOTE(review): the code reads and writes the <c>data</c> member of the model; the original
        /// comment described the JSON field as "content" - confirm the serialized name.
        /// </remarks>
        /// <param name="base64Pdf">A small model holding a base64 encoded pdf in its <c>data</c> member.</param>
        /// <returns>
        /// A JSON result carrying a new <c>Base64Pdf</c> with the re-saved document, a bad-request result
        /// when the model or its data is missing or is not valid base64, or an internal-server-error
        /// result when processing throws.
        /// </returns>
        public IHttpActionResult Post(Base64Pdf base64Pdf)
        {
            try
            {
                // The model itself may be null (e.g. nothing could be bound); answer 400 rather than
                // letting the null dereference surface as a 500.
                if (base64Pdf == null || base64Pdf.data == null)
                    return BadRequest("Check supplied pdf model");

                byte[] data;
                try
                {
                    data = Convert.FromBase64String(base64Pdf.data);
                }
                catch (FormatException)
                {
                    // Malformed base64 is a client error, not a server failure.
                    return BadRequest("Check supplied pdf model");
                }

                //Compress
                byte[] compressedData;
                using (var memStream = new MemoryStream())
                {
                    var reader = new PdfReader(data);
                    try
                    {
                        var stamper = new PdfStamper(reader, memStream, PdfWriter.VERSION_1_4);
                        var pageNum = reader.NumberOfPages;

                        for (var i = 1; i <= pageNum; i++)
                            reader.SetPageContent(i, reader.GetPageContent(i));

                        stamper.SetFullCompression();
                        // The stamper must be closed before the stream is read back.
                        stamper.Close();
                    }
                    finally
                    {
                        // Release the reader even when stamping fails.
                        reader.Close();
                    }

                    compressedData = memStream.ToArray();
                }
                var compressedBase64 = Convert.ToBase64String(compressedData);

                return Json(new Base64Pdf { data = compressedBase64 });
            }
            catch (Exception ex)
            {
                return InternalServerError(ex);
            }
        }
        /// <summary>
        /// Builds the form XObject stream for one page of the underlying reader.
        /// @since   2.1.3 (the method already existed without param compressionLevel)
        /// </summary>
        /// <remarks>
        /// The resulting dictionary carries the page's resources, the Type/Subtype/FormType entries,
        /// the imported page's bounding box and its matrix (the identity matrix when it has none).
        /// When the page's Contents entry is a stream, that stream is wrapped together with the
        /// dictionary; otherwise the content bytes are fetched through the reader, or an empty array
        /// is used when the page has no Contents entry.
        /// </remarks>
        /// <param name="pageNumber">the page of which you want the stream</param>
        /// <param name="compressionLevel">the compression level you want to apply to the stream. NOTE(review): this value is not referenced anywhere in the method body.</param>
        /// <returns>a PdfStream object</returns>
        internal PdfStream GetFormXObject(int pageNumber, int compressionLevel)
        {
            var pageDict    = reader.GetPageNRelease(pageNumber);
            var pageContent = PdfReader.GetPdfObjectRelease(pageDict.Get(PdfName.Contents));
            var formDict    = new PdfDictionary();

            // rawBytes stays null only when the page content is itself a stream.
            byte[] rawBytes = null;
            if (pageContent == null)
            {
                rawBytes = new byte[0];
            }
            else if (pageContent.IsStream())
            {
                formDict.Merge((PrStream)pageContent);
            }
            else
            {
                rawBytes = reader.GetPageContent(pageNumber, File);
            }

            formDict.Put(PdfName.Resources, PdfReader.GetPdfObjectRelease(pageDict.Get(PdfName.Resources)));
            formDict.Put(PdfName.TYPE, PdfName.Xobject);
            formDict.Put(PdfName.Subtype, PdfName.Form);

            var importedPage = (PdfImportedPage)ImportedPages[pageNumber];
            formDict.Put(PdfName.Bbox, new PdfRectangle(importedPage.BoundingBox));

            var pageMatrix = importedPage.Matrix;
            if (pageMatrix != null)
            {
                formDict.Put(PdfName.Matrix, pageMatrix);
            }
            else
            {
                formDict.Put(PdfName.Matrix, Identitymatrix);
            }

            formDict.Put(PdfName.Formtype, One);

            if (rawBytes != null)
            {
                // Content came as raw bytes: build a fresh stream and fold the dictionary into it.
                var assembled = new PrStream(reader, rawBytes);
                assembled.Merge(formDict);
                return assembled;
            }

            // Content was a stream: wrap it with the assembled dictionary.
            return new PrStream((PrStream)pageContent, formDict);
        }
        /// <summary>
        /// Converts two copies of the same .docx test file into a single PDF and checks that the
        /// result has exactly two pages whose content streams have the same length.
        /// </summary>
        public void TestMultipleDocuments()
        {
            string documentPath = Path.Combine(AppDomain.CurrentDomain.BaseDirectory, "test_files\\documents\\document.docx");
            byte[] testFile1 = File.ReadAllBytes(documentPath);
            byte[] testFile2 = File.ReadAllBytes(documentPath);
            Dictionary<string, byte[]> files = new Dictionary<string, byte[]>();
            files.Add("document1.docx", testFile1);
            files.Add("document2.docx", testFile2);
            PdfConverter converter = new PdfConverter();
            byte[] pdf = converter.ConvertFiles(files, new ConversionOptions());

            Assert.IsNotNull(pdf);

            PdfReader reader = new PdfReader(pdf);
            try
            {
                // Check the page count first so a one-page result fails on this assertion
                // instead of on the page-2 read below.
                Assert.AreEqual(2, reader.NumberOfPages);

                byte[] page1 = reader.GetPageContent(1);
                byte[] page2 = reader.GetPageContent(2);

                // Both inputs are the same document, so the two pages are expected to match in size.
                // (The original compared page1.Length with itself, which could never fail.)
                Assert.AreEqual(page1.Length, page2.Length);
            }
            finally
            {
                reader.Close();
            }
        }
示例#6
0
        /// <summary>
        /// Collects the string tokens found in the first page's content stream of a PDF, followed
        /// by the document's info entries.
        /// </summary>
        /// <param name="filePath">Path of the PDF file; passed to the <c>PdfReader</c> constructor.</param>
        /// <returns>
        /// All string tokens of page 1 concatenated without separators, followed by one
        /// "key => value" fragment per entry of the reader's info table (also without separators).
        /// </returns>
        public string ParsePdf(string filePath)
        {
            StringBuilder text = new StringBuilder();

            PdfReader reader = new iTextSharp.text.pdf.PdfReader(filePath);
            try
            {
                // The earlier version also re-read the whole file and decoded the page bytes into
                // several strings that were never used; one of those decodes threw on an empty
                // content stream (count of Length - 1 == -1). That dead work has been removed.
                byte[] streamBytes = reader.GetPageContent(1);
                PRTokeniser tokenizer = new PRTokeniser(streamBytes);

                while (tokenizer.NextToken())
                {
                    if (tokenizer.TokenType == PRTokeniser.TK_STRING)
                    {
                        text.Append(tokenizer.StringValue);
                    }
                }

                // file properties
                Hashtable   infoHash = reader.Info;
                ICollection keys     = infoHash.Keys;

                foreach (string key in keys)
                {
                    text.Append(key).Append(" => ").Append(infoHash[key]);
                }
            }
            finally
            {
                // Release the reader (and the file it holds) on every path.
                reader.Close();
            }

            return text.ToString();
        }
        /**
         * Builds the form XObject stream for one page of the underlying reader.
         *
         * The dictionary receives the page's resources, the TYPE/SUBTYPE/FORMTYPE entries, the
         * imported page's bounding box and its matrix (IDENTITYMATRIX when the page has none).
         * A page whose CONTENTS entry is a stream is wrapped directly; otherwise the content
         * bytes are read through the reader, or an empty array is used when CONTENTS is absent.
         *
         * @param   pageNumber          the page of which you want the stream
         * @param   compressionLevel    the compression level you want to apply to the stream
         *                              (NOTE(review): not referenced anywhere in this method body)
         * @return  a PdfStream object
         * @since   2.1.3 (the method already existed without param compressionLevel)
         */
        internal PdfStream GetFormXObject(int pageNumber, int compressionLevel)
        {
            PdfDictionary pageDict    = reader.GetPageNRelease(pageNumber);
            PdfObject     pageContent = PdfReader.GetPdfObjectRelease(pageDict.Get(PdfName.CONTENTS));
            PdfDictionary formDict    = new PdfDictionary();

            // A non-null rawBytes means the stream has to be built from bytes below.
            byte[] rawBytes = null;
            if (pageContent == null)
            {
                rawBytes = new byte[0];
            }
            else if (pageContent.IsStream())
            {
                formDict.Merge((PRStream)pageContent);
            }
            else
            {
                rawBytes = reader.GetPageContent(pageNumber, file);
            }

            formDict.Put(PdfName.RESOURCES, PdfReader.GetPdfObjectRelease(pageDict.Get(PdfName.RESOURCES)));
            formDict.Put(PdfName.TYPE, PdfName.XOBJECT);
            formDict.Put(PdfName.SUBTYPE, PdfName.FORM);

            PdfImportedPage importedPage = importedPages[pageNumber];
            formDict.Put(PdfName.BBOX, new PdfRectangle(importedPage.BoundingBox));

            PdfArray pageMatrix = importedPage.Matrix;
            if (pageMatrix != null)
            {
                formDict.Put(PdfName.MATRIX, pageMatrix);
            }
            else
            {
                formDict.Put(PdfName.MATRIX, IDENTITYMATRIX);
            }
            formDict.Put(PdfName.FORMTYPE, ONE);

            if (rawBytes != null)
            {
                // Raw bytes: create a new stream and merge the dictionary into it.
                PRStream assembled = new PRStream(reader, rawBytes);
                assembled.Merge(formDict);
                return assembled;
            }

            // Existing stream: wrap it with the assembled dictionary.
            return new PRStream((PRStream)pageContent, formDict);
        }
示例#8
0
        /// <summary>
        /// Reads the "in.pdf" test resource, eliminates shared streams, writes every page's content
        /// back unchanged through a stamper into "out1.pdf", and expects <c>CompareByContent</c>
        /// against "cmp_out1.pdf" to return null.
        /// </summary>
        public void SetPageContentTest01()
        {
            String outPdf = DestFolder + "out1.pdf";

            var source = TestResourceUtils.GetResourceAsStream(TestResourcesPath, "in.pdf");
            PdfReader reader = new PdfReader(source);

            var output = new FileStream(outPdf, FileMode.Create);
            PdfStamper stamper = new PdfStamper(reader, output);

            reader.EliminateSharedStreams();

            // Page numbers are 1-based; the count is taken once, after shared streams are eliminated.
            int lastPage = reader.NumberOfPages;
            for (int page = 1; page <= lastPage; page++)
            {
                byte[] content = reader.GetPageContent(page);
                reader.SetPageContent(page, content);
            }
            stamper.Close();

            var comparer = new CompareTool();
            var difference = comparer.CompareByContent(
                outPdf,
                TestResourceUtils.GetResourceAsTempFile(TestResourcesPath, "cmp_out1.pdf"),
                DestFolder,
                "diff_");
            Assert.Null(difference);
        }
示例#9
0
        /// <summary>
        /// Reads every page of the PDF at <c>_filePath</c> and returns one converted string per page.
        /// </summary>
        /// <returns>
        /// One entry per page, in page order. Each entry is the result of <c>GetDataConvertedData</c>
        /// applied to the page's raw content bytes decoded with <c>Encoding.Default</c>.
        /// </returns>
        public static List<String> Read()
        {
            var pdfReader = new PdfReader(_filePath);
            try
            {
                var pages = new List<String>();

                // PDF page numbers are 1-based.
                for (int pageNumber = 1; pageNumber <= pdfReader.NumberOfPages; pageNumber++)
                {
                    byte[] utf8Bytes = Encoding.Convert(Encoding.Default, Encoding.UTF8, pdfReader.GetPageContent(pageNumber));
                    string textFromPage = Encoding.UTF8.GetString(utf8Bytes);

                    pages.Add(GetDataConvertedData(textFromPage));
                }

                return pages;
            }
            finally
            {
                // Release the reader (and the file it holds) even when a page fails to convert.
                pdfReader.Close();
            }
        }
示例#10
0
        /// <summary> Extracts the text of every page of a PDF file and writes it to a UTF-8 text file </summary>
        /// <remarks>
        /// Pages whose extracted text is empty after trimming are skipped. A page that throws during
        /// extraction or writing is skipped silently and processing continues with the next page.
        /// </remarks>
        /// <param name="PDF_In_Name">Full path to the pdf file</param>
        /// <param name="Text_Out_Name">Output file name for the extracted text; the file is overwritten</param>
        /// <returns>TRUE if the page loop completed, FALSE if opening the reader or writer (or the loop itself) threw</returns>
        public static bool Extract_Text(string PDF_In_Name, string Text_Out_Name)
        {
            PdfReader pdf = null;
            StreamWriter writer = null;
            try
            {
                // Open the source PDF, then the destination text file.
                pdf = new PdfReader(PDF_In_Name);
                writer = new StreamWriter(Text_Out_Name, false, Encoding.UTF8);

                int pageNumber = 1;
                while (pageNumber <= pdf.NumberOfPages)
                {
                    try
                    {
                        string pageText = ExtractTextFromPDFBytes(pdf.GetPageContent(pageNumber));

                        if (pageText.Trim().Length != 0)
                        {
                            // Blank line, page header, blank line, then the page text.
                            writer.WriteLine();
                            writer.WriteLine("PAGE " + pageNumber);
                            writer.WriteLine();
                            writer.WriteLine(pageText);
                        }
                    }
                    catch
                    {
                        // Deliberately ignored: a failing page does not abort the whole export.
                    }
                    pageNumber++;
                }
                return true;
            }
            catch
            {
                return false;
            }
            finally
            {
                if (writer != null) writer.Close();
                if (pdf != null) pdf.Close();
            }
        }
        // ---------------------------------------------------------------------------
        /**
         * Builds a text report of the object and content information of a PDF.
         * For every page the report contains the page dictionary detail, an XObject
         * summary of the page resources, the raw content stream bytes (each byte
         * appended as one char) and the text extracted with a
         * LocationTextExtractionStrategy.
         *
         * @param pdf the original PDF as a byte array
         * @return the report as a single string
         *
         * this method uses code from;
         * PdfContentReaderTool.ListContentStreamForPage()
         * so i can pass in a byte array instead of file path
         *
         */
        public string InspectPdf(byte[] pdf)
        {
            PdfReader reader = new PdfReader(pdf);
            try
            {
                int maxPageNum = reader.NumberOfPages;
                StringBuilder sb = new StringBuilder();
                for (int pageNum = 1; pageNum <= maxPageNum; pageNum++)
                {
                    sb.AppendLine("==============Page " + pageNum + "====================");
                    sb.AppendLine("- - - - - Dictionary - - - - - -");
                    PdfDictionary pageDictionary = reader.GetPageN(pageNum);
                    sb.AppendLine(
                        PdfContentReaderTool.GetDictionaryDetail(pageDictionary)
                    );

                    sb.AppendLine("- - - - - XObject Summary - - - - - -");
                    sb.AppendLine(PdfContentReaderTool.GetXObjectDetail(
                        pageDictionary.GetAsDict(PdfName.RESOURCES))
                    );

                    sb.AppendLine("- - - - - Content Stream - - - - - -");
                    RandomAccessFileOrArray f = reader.SafeFile;
                    byte[] contentBytes;
                    try
                    {
                        contentBytes = reader.GetPageContent(pageNum, f);
                    }
                    finally
                    {
                        // Close the file handle even when reading the page content throws.
                        f.Close();
                    }

                    // No line break is added after the content bytes; the next header follows directly.
                    foreach (byte b in contentBytes)
                    {
                        sb.Append((char)b);
                    }

                    sb.AppendLine("- - - - - Text Extraction - - - - - -");
                    String extractedText = PdfTextExtractor.GetTextFromPage(
                        reader, pageNum, new LocationTextExtractionStrategy()
                    );
                    if (extractedText.Length != 0)
                    {
                        sb.AppendLine(extractedText);
                    }
                    else
                    {
                        sb.AppendLine("No text found on page " + pageNum);
                    }
                    sb.AppendLine();
                }
                return sb.ToString();
            }
            finally
            {
                // Release the reader on every path, including failures part-way through the report.
                reader.Close();
            }
        }
示例#12
0
        /// <summary>
        /// Extracts the text of the first pages of a PDF file.
        /// </summary>
        /// <param name="inFileName">the full path to the pdf file.</param>
        /// <param name="topage">number of the last page to extract; pages 1 through this value are read.</param>
        /// <returns>
        /// the extracted text, each page followed by a single space; <c>null</c> when nothing was
        /// extracted or when any exception occurred (in which case a line with the date, time and
        /// file name is appended to "log_extract.txt").
        /// </returns>
        public String ExtractText(string inFileName,int topage)
        {
            PdfReader reader = null;
            try
            {
                // Create a reader for the given PDF file
                reader = new PdfReader(inFileName);

                Console.Write("Processing: ");

                System.Text.StringBuilder outputText = new System.Text.StringBuilder();
                for (int page = 1; page <= topage; page++)
                {
                    outputText.Append(ExtractTextFromPDFBytes(reader.GetPageContent(page))).Append(' ');
                }

                return (outputText.Length == 0) ? null : outputText.ToString();
            }
            catch
            {
                File.AppendAllText("log_extract.txt", DateTime.Now.ToShortDateString() + " " + DateTime.Now.ToShortTimeString() + ": " + inFileName + Environment.NewLine);
                return null;
            }
            finally
            {
                // The reader was previously never closed, which left the input file open.
                if (reader != null) reader.Close();
            }
        }
示例#13
0
// ---------------------------------------------------------------------------
    /**
     * Re-saves a PDF through a PdfStamper with the writer's compression level
     * set to 9 and full compression enabled, after reading and writing back
     * every page's content.
     * @param src the original PDF
     * @return the bytes of the in-memory stream the stamper wrote to
     */
    public byte[] CompressPdf(byte[] src) {
      PdfReader pdf = new PdfReader(src);
      using (MemoryStream output = new MemoryStream()) {
        using (PdfStamper stamper = new PdfStamper(pdf, output, PdfWriter.VERSION_1_5)) {
          stamper.Writer.CompressionLevel = 9;

          // Page numbers are 1-based.
          int pageCount = pdf.NumberOfPages;
          for (int page = 1; page <= pageCount; page++) {
            byte[] content = pdf.GetPageContent(page);
            pdf.SetPageContent(page, content);
          }

          stamper.SetFullCompression();
        }
        // The stamper has been disposed at this point, so the stream holds the finished document.
        return output.ToArray();
      }
    }
        /// <summary>
        /// Extracts the text of a PDF file into a UTF-8 text file, printing a '#' progress bar
        /// to the console while doing so.
        /// </summary>
        /// <param name="inFileName">the full path to the pdf file.</param>
        /// <param name="outFileName">the output file name; the file is overwritten.</param>
        /// <returns><c>true</c> when all pages were written, <c>false</c> when any exception occurred.</returns>
        public bool ExtractText(string inFileName, string outFileName)
        {
            StreamWriter outFile = null;
            PdfReader reader = null;
            try
            {
                // Create a reader for the given PDF file
                reader = new PdfReader(inFileName);
                outFile = new StreamWriter(outFileName, false, System.Text.Encoding.UTF8);

                Console.Write("Processing: ");

                // The progress bar is totalLen characters wide; charUnit is the share of one page.
                int     totalLen    = 68;
                float   charUnit    = ((float)totalLen) / (float)reader.NumberOfPages;
                int     totalWritten= 0;
                float   curUnit     = 0;

                for (int page = 1; page <= reader.NumberOfPages; page++)
                {
                    outFile.Write(ExtractTextFromPDFBytes(reader.GetPageContent(page)) + " ");

                    // Write the progress.
                    if (charUnit >= 1.0f)
                    {
                        for (int i = 0; i < (int)charUnit; i++)
                        {
                            Console.Write("#");
                            totalWritten++;
                        }
                    }
                    else
                    {
                        // Less than one character per page: accumulate until a whole one is due.
                        curUnit += charUnit;
                        if (curUnit >= 1.0f)
                        {
                            for (int i = 0; i < (int)curUnit; i++)
                            {
                                Console.Write("#");
                                totalWritten++;
                            }
                            curUnit = 0;
                        }

                    }
                }

                // Pad the bar to its full width.
                if (totalWritten < totalLen)
                {
                    for (int i = 0; i < (totalLen - totalWritten); i++)
                    {
                        Console.Write("#");
                    }
                }
                return true;
            }
            catch (Exception)
            {
                return false;
            }
            finally
            {
                if (outFile != null) outFile.Close();
                // The reader was previously never closed, which left the input file open.
                if (reader != null) reader.Close();
            }
        }
示例#15
0
        /// <summary>
        /// Compares the content bytes of a run of pages in one PDF with the pages of a second PDF,
        /// starting at <paramref name="secondStartPage"/> and advancing in step.
        /// </summary>
        /// <param name="firstPdf">Path of the first PDF.</param>
        /// <param name="firstStartPage">First page of the first PDF to compare.</param>
        /// <param name="firstLastPage">
        /// Upper bound of the page loop. NOTE(review): the loop stops before this page (exclusive
        /// bound), as in the original code - confirm that this page is meant to be excluded.
        /// </param>
        /// <param name="secondPdf">Path of the second PDF.</param>
        /// <param name="secondStartPage">Page of the second PDF compared with <paramref name="firstStartPage"/>.</param>
        /// <returns><c>true</c> when every compared pair of pages has identical content bytes.</returns>
        /// <exception cref="ArgumentNullException">A page content array is null.</exception>
        private bool ArePagesIdentical(String firstPdf, int firstStartPage, int firstLastPage,
            String secondPdf, int secondStartPage)
        {
            iTextSharpPDF.PdfReader firstPdfReader = null;
            iTextSharpPDF.PdfReader secondPdfReader = null;
            try
            {
                // Opened inside the try so the first reader is released if opening the second fails.
                firstPdfReader = new iTextSharpPDF.PdfReader(new iTextSharpPDF.RandomAccessFileOrArray(firstPdf), null);
                secondPdfReader = new iTextSharpPDF.PdfReader(new iTextSharpPDF.RandomAccessFileOrArray(secondPdf), null);

                int secondPdfPage = secondStartPage;
                for (int currentFirstPage = firstStartPage; currentFirstPage < firstLastPage; currentFirstPage++)
                {
                    // Compare the bytes directly. The earlier version compared only the first
                    // 32 bits of an MD5 hash, so different pages could be reported as identical.
                    if (!PageBytesEqual(firstPdfReader.GetPageContent(currentFirstPage),
                                        secondPdfReader.GetPageContent(secondPdfPage)))
                    {
                        return false;
                    }
                    secondPdfPage++;
                }
                return true;
            }
            finally
            {
                if (firstPdfReader != null) firstPdfReader.Close();
                if (secondPdfReader != null) secondPdfReader.Close();
            }
        }

        /// <summary>Returns true when both arrays have the same length and the same bytes.</summary>
        /// <exception cref="ArgumentNullException">Either array is null (the hash-based version also threw this for null content).</exception>
        private static bool PageBytesEqual(byte[] left, byte[] right)
        {
            if (left == null) throw new ArgumentNullException("left");
            if (right == null) throw new ArgumentNullException("right");

            if (left.Length != right.Length)
            {
                return false;
            }
            for (int i = 0; i < left.Length; i++)
            {
                if (left[i] != right[i])
                {
                    return false;
                }
            }
            return true;
        }
示例#16
0
    /// <summary>
    /// Searches the pages of a PDF file for a member number and, when an output file is given,
    /// extracts the matching page through <c>ExtractPages</c>.
    /// </summary>
    /// <remarks>
    /// A page matches when its extracted text contains the member number enclosed in "\n\r" on
    /// both sides and the second '\r'-separated segment of the text does not start with "-".
    /// When several pages match, the last one is the page that gets extracted.
    /// </remarks>
    /// <param name="inFileName">the full path to the pdf file.</param>
    /// <param name="outputFile">the output file name; when null or empty no page is extracted.</param>
    /// <param name="memberNumber">the member number to look for.</param>
    /// <param name="notUsed">not read by this method.</param>
    /// <returns>true when at least one page matched, otherwise false.</returns>
    private static Boolean GetInvoice(string inFileName, String outputFile, String memberNumber, Boolean notUsed)
    {
        Boolean memberFound = false;
        int pageFound = -1;

        // Create a reader for the given PDF file
        PdfReader reader = new PdfReader(inFileName);
        try
        {
            for (int page = 1; page <= reader.NumberOfPages; page++)
            {
                string txt = ExtractTextFromPDFBytes(reader.GetPageContent(page));

                if (!txt.Contains("\n\r" + memberNumber + "\n\r"))
                    continue;

                // Split only after a match: the matched pattern contains two '\r' characters, so
                // segment [1] is guaranteed to exist. Splitting every page up front threw
                // IndexOutOfRangeException on any page without a '\r' (e.g. an empty page).
                string number = txt.Split('\r')[1];

                if (!number.StartsWith("-"))
                {
                    if (!String.IsNullOrEmpty(outputFile))
                        pageFound = page;

                    memberFound = true;
                }
            }
        }
        finally
        {
            // Release the input file before the matching page is extracted from it.
            reader.Close();
        }

        if( memberFound && pageFound > -1 )
            ExtractPages(inFileName, outputFile, pageFound, pageFound);

        return memberFound;
    }
示例#17
0
        /// <summary>
        /// Parses the first page of a PDF report into a <c>KpiBudget</c>.
        /// </summary>
        /// <remarks>
        /// The page content is decoded as UTF-8 and three markers are located in order with
        /// <c>FindIndex</c>/<c>VerifyIndex</c>. The text between the second and third marker is
        /// split into lines, only lines starting with "[(" are kept, and the PDF operator
        /// decoration, spaces and slashes are stripped from them. Each line containing one of the
        /// known category names is followed by two lines that are parsed as decimals.
        /// </remarks>
        /// <param name="bytestream">the PDF document as a byte array</param>
        /// <returns>
        /// the populated budget, or null when a marker is not verified or when an exception occurs
        /// (the exception is passed to <c>Log.Unhandled</c>).
        /// </returns>
        private KpiBudget ParseC810(byte[] bytestream)
        {
            try
            {
                PdfReader pdf = new PdfReader(bytestream);
                string pageText = System.Text.Encoding.UTF8.GetString(pdf.GetPageContent(1));

                int totalPos = FindIndex(pageText, 0, "[(Total)]TJ");
                if (!VerifyIndex(totalPos, "Fant ikke først element 'Total'"))
                {
                    return null;
                }

                int categoryPos = FindIndex(pageText, totalPos + 1, "[(Category)]TJ");
                if (!VerifyIndex(categoryPos, "Fant ikke andre element 'Category'"))
                {
                    return null;
                }

                int salePos = FindIndex(pageText, categoryPos + 1, "[(Sale/ Budget incl. VAT - Month to Date )]TJ");
                if (!VerifyIndex(salePos, "Fant ikke tredje element 'Sale/ Budget incl. VAT - Month to Date'"))
                {
                    return null;
                }

                // Only the text between the 'Category' and 'Sale/ Budget' markers is of interest.
                string section = pageText.Substring(categoryPos, salePos - categoryPos);

                List<string> rows = section.Split('\n').ToList();
                rows.RemoveAll(row => !row.StartsWith("[("));

                for (int r = 0; r < rows.Count; r++)
                {
                    string cleaned = rows[r].Trim();
                    cleaned = cleaned.Replace("[(", string.Empty);
                    cleaned = cleaned.Replace(")]TJ", string.Empty);
                    cleaned = cleaned.Replace(" ", string.Empty);
                    cleaned = cleaned.Replace("/", string.Empty);
                    rows[r] = cleaned;
                }

                var budget = new KpiBudget();
                budget.Date = selectedDate;

                string[] categoryNames = { "MDA", "AudioVideo", "SDA", "Telecom", "Computing", "Kitchen", "Other", "Total" };

                for (int i = 0; i < rows.Count; i++)
                {
                    string type = rows[i];
                    if (!categoryNames.Any(name => type.Contains(name)))
                    {
                        continue;
                    }

                    // The two rows after a category row hold its sales and GM figures.
                    decimal decSales = 0;
                    decimal.TryParse(rows[++i], out decSales);

                    decimal decGM = 0;
                    decimal.TryParse(rows[++i], out decGM);

                    var element = new KpiBudgetElement();
                    element.Insert(type, decSales, decGM);
                    budget.element.Add(element);
                }

                return budget;
            }
            catch (Exception ex)
            {
                Log.Unhandled(ex);
                return null;
            }
        }
示例#18
0
        /// <summary>
        /// Extracts the text of every page of a PDF file.
        /// </summary>
        /// <param name="inFileName">Path of the PDF file; passed to the <c>PdfReader</c> constructor.</param>
        /// <returns>
        /// The extracted text of the pages, each followed by a single space. When extraction throws
        /// part-way through, the text gathered up to that point is returned.
        /// </returns>
        internal string ExtractText(string inFileName)
        {
            PdfReader reader = new PdfReader(inFileName);
            System.Text.StringBuilder results = new System.Text.StringBuilder();

            try
            {
                for (int page = 1; page <= reader.NumberOfPages; page++)
                {
                    results.Append(ExtractTextFromPDFBytes(reader.GetPageContent(page))).Append(' ');
                }
            }
            catch (Exception m)
            {
                // NOTE(review): the exception object is created but never thrown, so the failure is
                // not propagated. Kept as-is because the constructor may have side effects (such as
                // logging) that callers rely on - confirm the intent.
                MyException mobj = new MyException("ExtractText() : " + m.Message);
            }
            finally
            {
                // The reader was previously never closed, which left the input file open.
                reader.Close();
            }

            return results.ToString();
        }
示例#19
0
// ---------------------------------------------------------------------------    
    /**
     * Reads the content stream of the first page of a PDF and returns it as a string.
     * @param src the PDF file
     */
    public string ReadContent(byte[] src) {
      // Decodes the raw content stream of page 1 as UTF-8 text.
      PdfReader reader = new PdfReader(src);
      try {
        byte[] pc = reader.GetPageContent(1);
        return Encoding.UTF8.GetString(pc, 0, pc.Length);
      }
      finally {
        // The reader is only needed to fetch the bytes; release it on every path.
        reader.Close();
      }
    }
示例#20
0
        /// <summary>
        /// Asks the user to pick a PDF file and fills <c>_pdfPages</c> with one string per page,
        /// each made of the concatenated string tokens found in that page's content stream.
        /// Any failure is reported through a message box; <c>CurrentIndex</c> is reset to 1 in
        /// every case.
        /// </summary>
        private void OpenPdf()
        {
            _pdfPages.Clear();
            try
            {
                var openFileDialog = new OpenFileDialog
                                         {
                                             DefaultExt = ".pdf",
                                             Filter = "Pdf documents (.pdf)|*.pdf"
                                         };

                bool? result = openFileDialog.ShowDialog();

                if (result == true)
                {
                    var pdfReader = new PdfReader(openFileDialog.FileName);
                    try
                    {
                        for (int i = 1; i <= pdfReader.NumberOfPages; i++)
                        {
                            byte[] pagesBytes = pdfReader.GetPageContent(i);
                            var token = new PRTokeniser(pagesBytes);
                            var pageContent = new StringBuilder();
                            while (token.NextToken())
                            {
                                // Only string tokens carry text; operators and numbers are skipped.
                                if (token.TokenType == PRTokeniser.TokType.STRING)
                                {
                                    pageContent.Append(token.StringValue);
                                }
                            }
                            _pdfPages.Add(pageContent.ToString());
                        }
                    }
                    finally
                    {
                        // Close the reader so the selected file is released even if a page fails to parse.
                        pdfReader.Close();
                    }
                }
                RaisePropertyChanged("MaxIndex");
            }
            catch (Exception)
            {
                MessageBox.Show("Fail to load file");
            }
            CurrentIndex = 1;
        }
示例#21
0
        /// <summary>
        /// Decodes the raw content bytes of every page of a PDF file as KOI8-R text and returns
        /// the pages joined with a trailing space each, writing a '#' progress trace to the
        /// debug output while it works.
        /// </summary>
        /// <param name="inFileName">Path of the PDF file to read.</param>
        /// <param name="tot">
        /// Receives the number of progress marks written inside the page loop, or -1 when an
        /// exception occurred.
        /// </param>
        /// <returns>The decoded text of all pages, or the string "-1" when an exception occurred.</returns>
        public string ExtractText(string inFileName, out int tot)
        {
            PdfReader reader = null;
            try
            {
                reader = new PdfReader(inFileName);

                Debug.WriteLine("Processing: ");

                // The progress trace is scaled to a fixed width of 68 marks.
                int totalLen = 68;
                float charUnit = ((float)totalLen) / (float)reader.NumberOfPages;
                int totalWritten = 0;
                float curUnit = 0;

                StringBuilder outs = new StringBuilder();

                for (int page = 1; page <= reader.NumberOfPages; page++)
                {
                    outs.Append(Encoding.GetEncoding("koi8-r").GetString(reader.GetPageContent(page)));
                    outs.Append(" ");

                    // Write the progress.
                    if (charUnit >= 1.0f)
                    {
                        for (int i = 0; i < (int)charUnit; i++)
                        {
                            Debug.WriteLine("#");
                            totalWritten++;
                        }
                    }
                    else
                    {
                        // Fewer than one mark per page: accumulate until a whole mark is due.
                        curUnit += charUnit;
                        if (curUnit >= 1.0f)
                        {
                            for (int i = 0; i < (int)curUnit; i++)
                            {
                                Debug.WriteLine("#");
                                totalWritten++;
                            }
                            curUnit = 0;
                        }
                    }
                }

                // Pad the trace up to the full width; these marks are not counted in tot.
                if (totalWritten < totalLen)
                {
                    for (int i = 0; i < (totalLen - totalWritten); i++)
                    {
                        Debug.WriteLine("#");
                    }
                }
                tot = totalWritten;
                return outs.ToString();
            }
            catch (Exception ex)
            {
                Debug.WriteLine("2"+ex.Message);
                tot = -1;
                return "-1";
            }
            finally
            {
                // Release the file on both the success and the failure path.
                if (reader != null)
                {
                    reader.Close();
                }
            }
        }
示例#22
0
        /// <summary>
        /// Compares the first-page content streams of two PDF files token by token.
        /// Number tokens are considered equal when they differ by at most 0.001; all other
        /// tokens must have the same type and the same string value.
        /// </summary>
        /// <param name="path1">Path of the first PDF file.</param>
        /// <param name="path2">Path of the second PDF file.</param>
        /// <returns>
        /// <c>false</c> as soon as a token differs or the second stream runs out of tokens first;
        /// otherwise <c>true</c>.
        /// </returns>
        public virtual bool CompareInnerText(String path1, String path2) {
            PdfReader reader1 = null;
            PdfReader reader2 = null;

            // Both readers are opened inside the try so that a failure while opening or reading
            // the second file cannot leak the first one.
            try {
                reader1 = new PdfReader(path1);
                byte[] streamBytes1 = reader1.GetPageContent(1);
                PRTokeniser tokenizer1 =
                    new PRTokeniser(new RandomAccessFileOrArray(new RandomAccessSourceFactory().CreateSource(streamBytes1)));

                reader2 = new PdfReader(path2);
                byte[] streamBytes2 = reader2.GetPageContent(1);
                PRTokeniser tokenizer2 =
                    new PRTokeniser(new RandomAccessFileOrArray(new RandomAccessSourceFactory().CreateSource(streamBytes2)));

                while (tokenizer1.NextToken()) {
                    if (!tokenizer2.NextToken())
                        return false;

                    if (tokenizer1.TokenType != tokenizer2.TokenType)
                        return false;

                    if (tokenizer1.TokenType == PRTokeniser.TokType.NUMBER) {
                        if (Math.Abs(float.Parse(tokenizer1.StringValue, CultureInfo.InvariantCulture)
                                     - float.Parse(tokenizer2.StringValue, CultureInfo.InvariantCulture)) > 0.001)
                            return false;
                    } else if (!tokenizer1.StringValue.Equals(tokenizer2.StringValue))
                        return false;
                }

                // NOTE(review): tokens remaining in the second stream are not inspected, so a
                // second page that merely starts with the first page's tokens compares as equal.
                // Left unchanged because callers may rely on it — confirm the intended contract.
                return true;
            }
            finally {
                if (reader1 != null)
                    reader1.Close();
                if (reader2 != null)
                    reader2.Close();
            }
        }
示例#23
0
        /// <summary>
        /// Fills the title and text of a property bag from a crawled PDF response.
        /// Responses whose status is not OK, or whose content type is not accepted by
        /// <c>IsPdfContent</c>, are left untouched.
        /// </summary>
        /// <param name="crawler">The crawler; only checked for null here.</param>
        /// <param name="propertyBag">Carries the response to read and receives Title and Text.</param>
        public void Process(Crawler crawler, PropertyBag propertyBag)
        {
            AspectF.Define.
                NotNull(crawler, "crawler").
                NotNull(propertyBag, "propertyBag");

            if (propertyBag.StatusCode != HttpStatusCode.OK || !IsPdfContent(propertyBag.ContentType))
            {
                return;
            }

            PdfReader pdfReader = new PdfReader(propertyBag.Response);
            try
            {
                // Use the document's "Title" info entry when it is present and non-blank.
                object rawTitle = pdfReader.Info["Title"];
                if (!rawTitle.IsNull())
                {
                    string trimmedTitle = Convert.ToString(rawTitle, CultureInfo.InvariantCulture).Trim();
                    if (!trimmedTitle.IsNullOrEmpty())
                    {
                        propertyBag.Title = trimmedTitle;
                    }
                }

                // Token-walking approach taken from:
                // http://www.vbforums.com/showthread.php?t=475759
                StringBuilder text = new StringBuilder();
                for (int pageNumber = 1; pageNumber <= pdfReader.NumberOfPages; pageNumber++)
                {
                    byte[] content = pdfReader.GetPageContent(pageNumber);
                    if (!content.IsNull())
                    {
                        PRTokeniser tokeniser = new PRTokeniser(content);
                        while (tokeniser.NextToken())
                        {
                            int kind = tokeniser.TokenType;
                            string value = tokeniser.StringValue;

                            if (kind == PRTokeniser.TK_STRING)
                            {
                                text.Append(value);
                                text.Append(" ");
                            }
                            // NOTE(review): 1 and 10 are raw token-type codes, presumably the
                            // number and "other"/operator types — confirm against PRTokeniser.
                            else if ((kind == 1 && value == "-600") || (kind == 10 && value == "TJ"))
                            {
                                text.Append(" ");
                            }
                        }
                    }
                }

                propertyBag.Text = text.ToString();
            }
            finally
            {
                pdfReader.Close();
            }
        }
示例#24
0
// ---------------------------------------------------------------------------    
    /**
     * Parses the PDF using PRTokeniser
     * @param src the original PDF file
     */
    public string ParsePdf(byte[] src) {
      // Returns every string token of page 1's content stream, one per line.
      PdfReader reader = new PdfReader(src);
      try {
        // we can inspect the syntax of the imported page
        byte[] streamBytes = reader.GetPageContent(1);
        StringBuilder sb = new StringBuilder();
        PRTokeniser tokenizer = new PRTokeniser(streamBytes);
        while (tokenizer.NextToken()) {
          if (tokenizer.TokenType == PRTokeniser.TokType.STRING) {
            sb.AppendLine(tokenizer.StringValue);
          }
        }
        return sb.ToString();
      }
      finally {
        // Release the reader on every path, including tokenizer failures.
        reader.Close();
      }
    }
示例#25
0
// ---------------------------------------------------------------------------
    /**
     * Re-saves the PDF src with compression switched off and returns the result as a byte array
     * @param src the original PDF
     */
    public byte[] DecompressPdf(byte[] src) {
      PdfReader reader = new PdfReader(src);
      // Document.Compress is a process-wide switch: remember its value so it can be put back
      // even when stamping fails, instead of leaving compression disabled for everyone else.
      bool previousCompress = Document.Compress;
      try {
        using (MemoryStream ms = new MemoryStream()) {
          using (PdfStamper stamper = new PdfStamper(reader, ms)) {
            Document.Compress = false;
            int pageCount = reader.NumberOfPages;
            for (int i = 1; i <= pageCount; i++) {
              // Re-setting the page content makes the stamper write it back under the
              // current (uncompressed) setting.
              reader.SetPageContent(i, reader.GetPageContent(i));
            }
          }
          // The stamper is disposed at this point, so the stream holds the finished document.
          return ms.ToArray();
        }
      }
      finally {
        Document.Compress = previousCompress;
        reader.Close();
      }
    }
示例#26
0
        /// <summary>
        /// Checks whether a run of pages in one PDF has the same page content bytes as the
        /// run of pages starting at <paramref name="secondStartPage"/> in another PDF.
        /// </summary>
        /// <param name="firstPdf">Path of the first PDF file.</param>
        /// <param name="firstStartPage">First page of the first PDF to compare.</param>
        /// <param name="firstLastPage">
        /// Upper bound of the range in the first PDF; the loop stops before this page.
        /// NOTE(review): the bound is exclusive, so the page named "last" is never compared —
        /// confirm against callers whether that is intended.
        /// </param>
        /// <param name="secondPdf">Path of the second PDF file.</param>
        /// <param name="secondStartPage">Page of the second PDF matched against <paramref name="firstStartPage"/>.</param>
        /// <returns><c>true</c> when every compared page pair has identical content bytes.</returns>
        private bool ArePagesIdentical(String firstPdf, int firstStartPage, int firstLastPage,
                                       String secondPdf, int secondStartPage)
        {
            iTextSharpPDF.PdfReader firstPdfReader  = null;
            iTextSharpPDF.PdfReader secondPdfReader = null;

            try
            {
                // Opened inside the try so the first reader is closed if the second fails to open.
                firstPdfReader  = new iTextSharpPDF.PdfReader(new iTextSharpPDF.RandomAccessFileOrArray(firstPdf), null);
                secondPdfReader = new iTextSharpPDF.PdfReader(new iTextSharpPDF.RandomAccessFileOrArray(secondPdf), null);

                int secondPdfPage = secondStartPage;
                for (int currentFirstPage = firstStartPage; currentFirstPage < firstLastPage; currentFirstPage++)
                {
                    // Compare the bytes themselves: the previous check looked only at the first
                    // four bytes of an MD5 digest and could report different pages as identical.
                    if (!PageContentsEqual(firstPdfReader.GetPageContent(currentFirstPage),
                                           secondPdfReader.GetPageContent(secondPdfPage)))
                    {
                        return false;
                    }
                    secondPdfPage++;
                }
                return true;
            }
            finally
            {
                if (firstPdfReader != null)
                {
                    firstPdfReader.Close();
                }
                if (secondPdfReader != null)
                {
                    secondPdfReader.Close();
                }
            }
        }

        /// <summary>
        /// Byte-wise equality of two page content buffers; two nulls are equal, a single null is not.
        /// </summary>
        private static bool PageContentsEqual(byte[] first, byte[] second)
        {
            if (ReferenceEquals(first, second))
            {
                return true;
            }
            if (first == null || second == null || first.Length != second.Length)
            {
                return false;
            }
            for (int i = 0; i < first.Length; i++)
            {
                if (first[i] != second[i])
                {
                    return false;
                }
            }
            return true;
        }
示例#27
0
        /**
         * Writes information about a specific page from PdfReader to the specified output stream.
         * @since 2.1.5
         * @param reader    the PdfReader to read the page content from
         * @param pageNum   the page number to read
         * @param outp      the text writer to send the content to
         * @throws IOException
         */
        public static void ListContentStreamForPage(PdfReader reader, int pageNum, TextWriter outp) {
            outp.WriteLine("==============Page " + pageNum + "====================");
            outp.WriteLine("- - - - - Dictionary - - - - - -");
            PdfDictionary pageDictionary = reader.GetPageN(pageNum);
            outp.WriteLine(GetDictionaryDetail(pageDictionary));

            outp.WriteLine("- - - - - XObject Summary - - - - - -");
            outp.WriteLine(GetXObjectDetail(pageDictionary.GetAsDict(PdfName.RESOURCES)));

            outp.WriteLine("- - - - - Content Stream - - - - - -");
            RandomAccessFileOrArray f = reader.SafeFile;
            byte[] contentBytes;
            try {
                contentBytes = reader.GetPageContent(pageNum, f);
            }
            finally {
                // Close the file handle even when reading the page content throws.
                f.Close();
            }

            outp.Flush();

            // Each content byte is written as the character with the same code.
            foreach (byte b in contentBytes) {
                outp.Write((char)b);
            }

            outp.Flush();

            outp.WriteLine("- - - - - Text Extraction - - - - - -");
            String extractedText = PdfTextExtractor.GetTextFromPage(reader, pageNum, new LocationTextExtractionStrategy());
            if (extractedText.Length != 0)
                outp.WriteLine(extractedText);
            else
                outp.WriteLine("No text found on page " + pageNum);

            outp.WriteLine();

        }
示例#28
0
文件: xpdf.cs 项目: labeuze/source
        /// <summary>
        /// Writes a report about one page of a PDF to a text writer. Each section (dictionary,
        /// XObject summary, raw content stream, extracted text, and three text-block layouts)
        /// is emitted only when its corresponding <c>_output…</c> flag is set.
        /// </summary>
        /// <param name="reader">The reader holding the PDF.</param>
        /// <param name="pageNum">The page number to report on.</param>
        /// <param name="outp">The writer that receives the report.</param>
        public static void xpdfPage(PdfReader reader, int pageNum, TextWriter outp)
        {
            outp.WriteLine("==============Page " + pageNum + "====================");
            PdfDictionary pageDictionary = reader.GetPageN(pageNum);
            if (_outputDictionary)
            {
                outp.WriteLine("- - - - - Dictionary - - - - - -");
                string s = GetDictionaryDetail(pageDictionary);
                outp.WriteLine(s);
            }

            if (_outputXObject)
            {
                outp.WriteLine("- - - - - XObject summary - - - - - -");
                outp.WriteLine(PdfContentReaderTool.GetXObjectDetail(pageDictionary.GetAsDict(PdfName.RESOURCES)));
            }

            if (_outputContentStream)
            {
                outp.WriteLine("- - - - - Content stream - - - - - -");
                RandomAccessFileOrArray f = reader.SafeFile;
                byte[] contentBytes;
                try
                {
                    contentBytes = reader.GetPageContent(pageNum, f);
                }
                finally
                {
                    // Close the file handle even when reading the page content throws.
                    f.Close();
                }

                outp.Flush();

                // Each content byte is written as the character with the same code.
                foreach (byte b in contentBytes)
                {
                    outp.Write((char)b);
                }

                outp.Flush();
            }

            // The page is always processed, whichever flags are set; the text sections
            // below all read their data from this one strategy instance.
            Test_iTextSharp.LocationTextExtractionStrategy strategy = new Test_iTextSharp.LocationTextExtractionStrategy();
            Test_iTextSharp.PdfTools.ProcessContentPage(reader, pageNum, strategy);

            if (_outputText)
            {
                outp.WriteLine("- - - - - Text extraction - - - - - -");
                string extractedText = strategy.GetResultantText();
                if (extractedText.Length != 0)
                {
                    outp.WriteLine(extractedText);
                    outp.WriteLine();
                }
                else
                {
                    outp.WriteLine("No text found on page " + pageNum);
                }
            }

            if (_outputTextBlocks1)
            {
                outp.WriteLine("- - - - - Text blocks extraction 1 - - - - - -");
                foreach (Test_iTextSharp.TextBlock textBlock in strategy.textBlocks)
                {
                    PrintTextBlock(outp, textBlock, 0);
                }
                outp.WriteLine();
            }

            if (_outputTextBlocks2)
            {
                outp.WriteLine("- - - - - Text blocks extraction 2 - - - - - -");
                foreach (Test_iTextSharp.TextBlock textBlock in strategy.textBlocks)
                {
                    outp.Write("block  ");
                    outp.WriteLine(textBlock.GetText());
                    if (textBlock.childs.Count > 0)
                    {
                        outp.WriteLine("   **** warning childs blocks not printed ****");
                    }
                }
                outp.WriteLine();
            }

            if (_outputTextBlocks3)
            {
                outp.WriteLine("- - - - - Text blocks extraction 3 - - - - - -");
                foreach (Test_iTextSharp.TextBlock textBlock in strategy.textBlocks)
                {
                    // The first line of a block gets the "block" label; later lines are
                    // indented by the same width.
                    bool first = true;
                    foreach (string s in textBlock.GetTextByLines(_outputMaxCol))
                    {
                        if (first)
                        {
                            outp.Write("block  ");
                            first = false;
                        }
                        else
                        {
                            outp.Write("       ");
                        }
                        outp.WriteLine(s);
                    }
                    if (textBlock.childs.Count > 0)
                    {
                        outp.WriteLine("   **** warning childs blocks not printed ****");
                    }
                }
                outp.WriteLine();
            }

            outp.WriteLine();
        }