Пример #1
0
        /// <summary>
        /// Appends the value of every string token found in the given content object to
        /// <paramref name="sb"/>. Values are appended back to back with no separator.
        /// </summary>
        /// <remarks>
        /// <paramref name="content"/> may be a stream, an array of streams, or an indirect reference to
        /// either. Anything else (including <c>null</c> or a reference that cannot be resolved) is
        /// skipped instead of raising a cast or null-reference exception.
        /// </remarks>
        /// <param name="reader">Reader used to resolve indirect references.</param>
        /// <param name="sb">Builder that receives the extracted string values.</param>
        /// <param name="content">Content object to scan (presumably a page's /Contents entry - confirm against callers).</param>
        private static void ExtractLines(PdfReader reader, StringBuilder sb, PdfObject content)
        {
            PdfObject resolved = ResolveReference(reader, content);
            if (resolved == null)
            {
                return;
            }

            if (resolved.IsArray())
            {
                // Content may be split over several streams; scan each of them in order.
                foreach (PdfObject part in ((PdfArray)resolved).ArrayList)
                {
                    AppendStringTokens(sb, ResolveReference(reader, part));
                }

                return;
            }

            AppendStringTokens(sb, resolved);
        }

        /// <summary>
        /// Returns the object an indirect reference points to, or <paramref name="obj"/> itself when it
        /// is not an indirect reference.
        /// </summary>
        private static PdfObject ResolveReference(PdfReader reader, PdfObject obj)
        {
            var reference = obj as PRIndirectReference;
            return reference == null ? obj : reader.GetPdfObject(reference.Number);
        }

        /// <summary>
        /// Tokenises <paramref name="value"/> when it is a stream and appends every string token to
        /// <paramref name="sb"/>; does nothing for any other kind of object.
        /// </summary>
        private static void AppendStringTokens(StringBuilder sb, PdfObject value)
        {
            var stream = value as PRStream;
            if (stream == null)
            {
                return;
            }

            var streamBytes = PdfReader.GetStreamBytes(stream);
            var tokenizer   = new PRTokeniser(new RandomAccessFileOrArray(streamBytes));

            try
            {
                while (tokenizer.NextToken())
                {
                    if (tokenizer.TokenType == PRTokeniser.TK_STRING)
                    {
                        sb.Append(tokenizer.StringValue);
                    }
                }
            }
            finally
            {
                tokenizer.Close();
            }
        }
Пример #2
0
        /// <summary>
        /// Reads every page of the PDF at <paramref name="filePath"/> and returns the values of all
        /// string tokens found in the page content, appended back to back.
        /// </summary>
        /// <param name="filePath">Location passed to the <c>PdfReader</c> constructor.</param>
        /// <returns>
        /// The extracted text. If an exception is thrown while reading, its message is appended to
        /// whatever was extracted up to that point and the combined text is returned.
        /// </returns>
        private string ReadPDF(string filePath)
        {
            var       builder  = new StringBuilder();
            PdfReader document = null;

            try
            {
                document = new PdfReader(filePath);

                for (int i = 1; i <= document.NumberOfPages; i++)
                {
                    byte[] pageContent = document.GetPageContent(i);
                    var    tokenizer   = new PRTokeniser(new RandomAccessFileOrArray(pageContent));

                    try
                    {
                        while (tokenizer.NextToken())
                        {
                            if (tokenizer.TokenType == PRTokeniser.TokType.STRING)
                            {
                                builder.Append(tokenizer.StringValue);
                            }
                        }
                    }
                    finally
                    {
                        // Release the tokenizer's underlying source once the page has been scanned.
                        tokenizer.Close();
                    }
                }
            }
            catch (Exception e)
            {
                // Best effort by design: the error text becomes part of the returned string
                // rather than propagating to the caller.
                builder.Append(e.Message);
            }
            finally
            {
                document?.Close();
            }

            return builder.ToString();
        }
        /// <summary>
        /// Tokenises the content of every page of <paramref name="pdf"/> and feeds the resulting
        /// operations to <paramref name="strategy"/>.
        /// </summary>
        /// <remarks>
        /// Tokens are collected until one reports <c>IsOperand</c>; that token, together with the
        /// tokens collected before it, is wrapped in a <c>PdfOperation</c> and executed. Tokens left
        /// over at the end of a page are discarded.
        /// </remarks>
        /// <param name="pdf">Bytes passed to the <c>PdfReader</c> constructor.</param>
        /// <param name="strategy">Receiver of each parsed operation.</param>
        private static void ParsePdf(byte[] pdf, IPdfParsingStrategy strategy)
        {
            PdfReader reader = new PdfReader(pdf);

            try
            {
                for (int i = 1; i <= reader.NumberOfPages; i++)
                {
                    byte[] page = reader.GetPageContent(i);
                    if (page == null)
                    {
                        continue;
                    }

                    PRTokeniser tokenizer = new PRTokeniser(page);
                    try
                    {
                        List<PdfToken> parameters = new List<PdfToken>();
                        while (tokenizer.NextToken())
                        {
                            var token = PdfToken.Create(tokenizer);

                            // NOTE(review): despite its name, IsOperand appears to flag the token that
                            // terminates an operation - confirm against PdfToken.
                            if (token.IsOperand)
                            {
                                // NOTE(review): the same list instance is cleared right after Execute;
                                // verify that PdfOperation/strategy do not keep a reference to it.
                                strategy.Execute(new PdfOperation(token, parameters));
                                parameters.Clear();
                            }
                            else
                            {
                                parameters.Add(token);
                            }
                        }
                    }
                    finally
                    {
                        tokenizer.Close();
                    }
                }
            }
            finally
            {
                // The reader was previously never closed.
                reader.Close();
            }
        }
Пример #4
0
        /// <summary>
        /// Splits the token stream held in <paramref name="DA"/> into commands and their arguments.
        /// </summary>
        /// <remarks>
        /// Argument tokens are collected until a token of type <c>OTHER</c> is read; that token is the
        /// command the collected arguments belong to. The commands <c>RG</c>, <c>G</c> and <c>K</c> are
        /// stored under <c>STROKE_COLOR</c>; <c>rg</c>, <c>g</c> and <c>k</c> under <c>FILL_COLOR</c>;
        /// every other command under its own text. When a key occurs more than once the last occurrence
        /// wins. Arguments that are not followed by a command are dropped.
        /// </remarks>
        /// <param name="DA">String whose bytes are tokenised.</param>
        /// <returns>
        /// Map from command key to its arguments: <c>PdfNumber</c> for number tokens, <c>PdfName</c>
        /// for name tokens, and the raw token text for everything else.
        /// </returns>
        IDictionary<string, IList<object>> ParseDAParam(PdfString DA)
        {
            IDictionary<string, IList<object>> commandArguments = new Dictionary<string, IList<object>>();

            PRTokeniser tokeniser = new PRTokeniser(new RandomAccessFileOrArray(new RandomAccessSourceFactory().CreateSource(DA.GetBytes())));

            try
            {
                IList<object> currentArguments = new List<object>();

                while (tokeniser.NextToken())
                {
                    switch (tokeniser.TokenType)
                    {
                    case PRTokeniser.TokType.OTHER:
                        string key = tokeniser.StringValue;

                        if (key == "RG" || key == "G" || key == "K")
                        {
                            key = STROKE_COLOR;
                        }
                        else if (key == "rg" || key == "g" || key == "k")
                        {
                            key = FILL_COLOR;
                        }

                        // The indexer adds a new entry or overwrites an existing one in a single lookup.
                        commandArguments[key] = currentArguments;
                        currentArguments      = new List<object>();
                        break;

                    case PRTokeniser.TokType.NUMBER:
                        currentArguments.Add(new PdfNumber(tokeniser.StringValue));
                        break;

                    case PRTokeniser.TokType.NAME:
                        currentArguments.Add(new PdfName(tokeniser.StringValue));
                        break;

                    default:
                        currentArguments.Add(tokeniser.StringValue);
                        break;
                    }
                }
            }
            finally
            {
                // The tokeniser was previously never closed.
                tokeniser.Close();
            }

            return commandArguments;
        }
Пример #5
0
        /// <summary>
        /// Compares the content of page 1 of two PDF files token by token.
        /// </summary>
        /// <remarks>
        /// Corresponding tokens must have the same type. Number tokens are considered equal when they
        /// differ by at most 0.001; every other token must have an identical string value. Both pages
        /// must also contain the same number of tokens - previously extra trailing tokens in the second
        /// file were ignored, so the result depended on the argument order.
        /// </remarks>
        /// <param name="path1">Location of the first PDF.</param>
        /// <param name="path2">Location of the second PDF.</param>
        /// <returns><c>true</c> when the two token sequences match; otherwise <c>false</c>.</returns>
        public virtual bool CompareInnerText(String path1, String path2)
        {
            PdfReader reader1 = null;
            PdfReader reader2 = null;

            try
            {
                reader1 = new PdfReader(path1);
                byte[]      streamBytes1 = reader1.GetPageContent(1);
                PRTokeniser tokenizer1   =
                    new PRTokeniser(new RandomAccessFileOrArray(new RandomAccessSourceFactory().CreateSource(streamBytes1)));

                reader2 = new PdfReader(path2);
                byte[]      streamBytes2 = reader2.GetPageContent(1);
                PRTokeniser tokenizer2   =
                    new PRTokeniser(new RandomAccessFileOrArray(new RandomAccessSourceFactory().CreateSource(streamBytes2)));

                while (tokenizer1.NextToken())
                {
                    // The second page ran out of tokens first.
                    if (!tokenizer2.NextToken())
                    {
                        return false;
                    }

                    if (tokenizer1.TokenType != tokenizer2.TokenType)
                    {
                        return false;
                    }

                    if (tokenizer1.TokenType == PRTokeniser.TokType.NUMBER)
                    {
                        float first  = float.Parse(tokenizer1.StringValue, CultureInfo.InvariantCulture);
                        float second = float.Parse(tokenizer2.StringValue, CultureInfo.InvariantCulture);

                        // Numbers are compared with a tolerance rather than textually.
                        if (Math.Abs(first - second) > 0.001)
                        {
                            return false;
                        }
                    }
                    else if (!tokenizer1.StringValue.Equals(tokenizer2.StringValue))
                    {
                        return false;
                    }
                }

                // The first page is exhausted; the pages only match if the second one is too.
                return !tokenizer2.NextToken();
            }
            finally
            {
                // Close both readers even when opening the second one (or closing the first) fails.
                try
                {
                    if (reader1 != null)
                    {
                        reader1.Close();
                    }
                }
                finally
                {
                    if (reader2 != null)
                    {
                        reader2.Close();
                    }
                }
            }
        }
Пример #6
0
        /// <summary>
        /// Extracts the string tokens from page 1 of a PDF and appends the entries of the document's
        /// info dictionary.
        /// </summary>
        /// <remarks>
        /// Earlier revisions also re-read the whole file from disk and decoded the page bytes into
        /// several unused locals. That work has been removed: it produced nothing, and the
        /// <c>streamBytes.Length - 1</c> decode passed a negative count when page 1 had no content.
        /// </remarks>
        /// <param name="filePath">Location passed to the <c>PdfReader</c> constructor.</param>
        /// <returns>
        /// The string tokens of page 1 appended back to back, followed by one
        /// <c>key =&gt; value</c> fragment per info entry (no separators in between).
        /// </returns>
        public string ParsePdf(string filePath)
        {
            PdfReader reader = new iTextSharp.text.pdf.PdfReader(filePath);

            try
            {
                StringBuilder text      = new StringBuilder();
                PRTokeniser   tokenizer = new PRTokeniser(reader.GetPageContent(1));

                try
                {
                    while (tokenizer.NextToken())
                    {
                        if (tokenizer.TokenType == PRTokeniser.TK_STRING)
                        {
                            text.Append(tokenizer.StringValue);
                        }
                    }
                }
                finally
                {
                    tokenizer.Close();
                }

                // File properties
                Hashtable infoHash = reader.Info;

                foreach (string key in infoHash.Keys)
                {
                    text.Append(key).Append(" => ").Append(infoHash[key]);
                }

                return text.ToString();
            }
            finally
            {
                // The reader was previously never closed.
                reader.Close();
            }
        }
Пример #7
0
// ---------------------------------------------------------------------------

        /**
         * Parses the PDF using PRTokeniser.
         * Only the content of page 1 is inspected; every string token found there
         * is written on its own line.
         * @param src the original PDF file
         * @return the string tokens of page 1, one per line
         */
        public string ParsePdf(byte[] src)
        {
            PdfReader reader = new PdfReader(src);

            try
            {
                // we can inspect the syntax of the imported page
                byte[]        streamBytes = reader.GetPageContent(1);
                StringBuilder sb          = new StringBuilder();
                PRTokeniser   tokenizer   = new PRTokeniser(streamBytes);

                try
                {
                    while (tokenizer.NextToken())
                    {
                        if (tokenizer.TokenType == PRTokeniser.TokType.STRING)
                        {
                            sb.AppendLine(tokenizer.StringValue);
                        }
                    }
                }
                finally
                {
                    tokenizer.Close();
                }

                return sb.ToString();
            }
            finally
            {
                // The reader was previously never closed.
                reader.Close();
            }
        }
Пример #8
0
        /// <summary>
        /// Tokenises the supplied bytes and collects the text they contain.
        /// Legacy algorithm written for iTextSharp 4.1.6; prefer iTextSharp version 5 or later where
        /// possible (license changes were made).
        /// </summary>
        /// <param name="input">Bytes to tokenise; may be <c>null</c> or empty.</param>
        /// <returns>
        /// The values of all string tokens (with NUL characters replaced by spaces), plus a single
        /// space for every <c>-600</c> or <c>TJ</c> token; an empty string when there is no input.
        /// </returns>
        internal static string ExtractTextFromPdfBytes(byte[] input)
        {
            if (input == null || input.Length == 0)
            {
                return "";
            }

            var text  = new StringBuilder();
            var lexer = new PRTokeniser(input);

            try
            {
                while (lexer.NextToken())
                {
                    var kind  = lexer.TokenType;
                    var value = lexer.StringValue.Replace('\0', ' ');

                    if (kind == PRTokeniser.TK_STRING)
                    {
                        text.Append(value);
                        continue;
                    }

                    // These two tokens are treated as word breaks in the output.
                    if (value == "-600" || value == "TJ")
                    {
                        text.Append(" ");
                    }
                }
            }
            finally
            {
                lexer.Close();
            }

            return text.ToString();
        }
Пример #9
0
        /// <summary>
        /// Reads a PDF from <paramref name="stream"/>, writes the string tokens of every page to the
        /// console (one per line) and returns the list of tables.
        /// </summary>
        /// <remarks>
        /// Page content is obtained through <c>PdfReader.GetPageContent</c> rather than by casting the
        /// page's /Contents entry to an indirect reference, so pages whose content is an array of
        /// streams, or is missing, no longer cause an exception.
        /// </remarks>
        /// <param name="stream">Stream containing the PDF document.</param>
        /// <returns>The table list; nothing is added to it here, so it is always empty.</returns>
        public List<DataTable> Load(MemoryStream stream)
        {
            var tables = new List<DataTable>();
            var sb     = new StringBuilder();
            var reader = new PdfReader(stream);

            try
            {
                for (int page = 1; page <= reader.NumberOfPages; page++)
                {
                    byte[] pageBytes = reader.GetPageContent(page);
                    if (pageBytes == null)
                    {
                        continue;
                    }

                    var tokenizer = new PRTokeniser(new RandomAccessFileOrArray(pageBytes));

                    try
                    {
                        while (tokenizer.NextToken())
                        {
                            if (tokenizer.TokenType == PRTokeniser.TK_STRING)
                            {
                                sb.AppendLine(tokenizer.StringValue);
                            }
                        }
                    }
                    finally
                    {
                        tokenizer.Close();
                    }
                }
            }
            finally
            {
                // The reader was previously never closed.
                reader.Close();
            }

            Console.WriteLine(sb.ToString());
            return tables;
        }
        /// <summary>
        /// Prints the string tokens found in the content of every page of a PDF.
        /// </summary>
        /// <remarks>
        /// Page content is obtained through <c>PdfReader.GetPageContent</c> rather than by casting the
        /// page's /Contents entry to an indirect reference, so pages whose content is an array of
        /// streams, or is missing, no longer cause an exception.
        /// </remarks>
        /// <param name="args">
        /// Optional: the first argument is the PDF to read. When absent, <c>C:\mypdf.pdf</c> is used.
        /// </param>
        static void Main(string[] args)
        {
            string        pdfPath = args != null && args.Length > 0 ? args[0] : "C:\\mypdf.pdf";
            PdfReader     reader  = new PdfReader(pdfPath);
            StringBuilder sb      = new StringBuilder();

            try
            {
                for (int page = 1; page <= reader.NumberOfPages; page++)
                {
                    byte[] pageBytes = reader.GetPageContent(page);
                    if (pageBytes == null)
                    {
                        continue;
                    }

                    var tokenizer = new PRTokeniser(new RandomAccessFileOrArray(pageBytes));

                    try
                    {
                        while (tokenizer.NextToken())
                        {
                            if (tokenizer.TokenType == PRTokeniser.TK_STRING)
                            {
                                sb.Append(tokenizer.StringValue);
                            }
                        }
                    }
                    finally
                    {
                        tokenizer.Close();
                    }
                }
            }
            finally
            {
                // The reader was previously never closed.
                reader.Close();
            }

            Console.Write("PDF Content:" + Environment.NewLine);
            Console.Write(sb.ToString());
            Console.Write(Environment.NewLine + "--EOF--");
        }
Пример #11
0
        /// <summary>
        /// Collects the straight line segments drawn on one page of a PDF.
        /// </summary>
        /// <remarks>
        /// Despite the method name, the scan looks for the <c>l</c> command rather than <c>re</c>.
        /// Every number token is buffered; when <c>l</c> is read, the last four buffered numbers are
        /// taken as <c>x1 y1 x2 y2</c> of a segment. The result is sorted with <c>List.Sort()</c>.
        /// </remarks>
        /// <param name="sourceFile">Location passed to the <c>PdfReader</c> constructor.</param>
        /// <param name="pageNumber">Page whose content is scanned.</param>
        /// <returns>The sorted list of segments found on the page.</returns>
        /// <exception cref="InvalidOperationException">
        /// An <c>l</c> command was read before four numbers were available.
        /// </exception>
        private static List<Line> FindRectangles(string sourceFile, int pageNumber)
        {
            var listOfLines = new List<Line>();

            // Token text is machine-readable: parse it culture-independently and accept a leading sign.
            // (AllowDecimalPoint alone rejects negative values and depends on the current culture.)
            var numberStyle  = System.Globalization.NumberStyles.Float;
            var numberFormat = System.Globalization.CultureInfo.InvariantCulture;

            //Bind a reader to our PDF
            using (PdfReader reader = new PdfReader(sourceFile))
            {
                //Buffer of previously seen number tokens
                List<string> buf = new List<string>();

                //Get the raw bytes for the page and tokenise them
                byte[]      pageBytes = reader.GetPageContent(pageNumber);
                PRTokeniser tokeniser = new PRTokeniser(new RandomAccessFileOrArray(pageBytes));

                try
                {
                    while (tokeniser.NextToken())
                    {
                        PRTokeniser.TokType tokenType  = tokeniser.TokenType;
                        string              tokenValue = tokeniser.StringValue;

                        if (tokenType == PRTokeniser.TokType.NUMBER)
                        {
                            //Store it in our buffer for later use
                            buf.Add(tokenValue);
                            continue;
                        }

                        //Otherwise we only care about the raw "l" command, categorized as "OTHER"
                        if (tokenType != PRTokeniser.TokType.OTHER || tokenValue != "l")
                        {
                            continue;
                        }

                        //Four values are read below, so four must be present
                        if (buf.Count < 4)
                        {
                            throw new InvalidOperationException("Not enough elements in buffer for a line");
                        }

                        float x1 = float.Parse(buf[buf.Count - 4], numberStyle, numberFormat);
                        float y1 = float.Parse(buf[buf.Count - 3], numberStyle, numberFormat);
                        float x2 = float.Parse(buf[buf.Count - 2], numberStyle, numberFormat);
                        float y2 = float.Parse(buf[buf.Count - 1], numberStyle, numberFormat);

                        listOfLines.Add(new Line()
                        {
                            BeginX = x1, BeginY = y1, EndX = x2, EndY = y2
                        });
                    }
                }
                finally
                {
                    tokeniser.Close();
                }
            }

            listOfLines.Sort();

            return listOfLines;
        }
Пример #12
0
        // Scratch test around PDFManager.ExtractText. It (1) copies every page of a source PDF into a
        // new document, (2) walks the pages of a second PDF while experimenting with PRTokeniser, and
        // (3) calls ExtractText on the raw input bytes. No assertion is made on any result.
        // NOTE(review): paths are hard-coded to the M: drive, so this only runs on the author's machine.
        public void ExtractTextTest1()
        {
            PDFManager pdfManager = new PDFManager(); // TODO: Initialize to an appropriate value

            //byte[] input = File.ReadAllBytes(DiscoveryManager.GetDiscoveryPath("M:\\DFD", "http://unicode.org/charts/PDF/U0590.pdf", ".pdf"));

            // NOTE(review): the path literal is empty, which File.ReadAllBytes is expected to reject -
            // confirm which input file was intended.
            byte[] input = File.ReadAllBytes(@"");


            string path = @"M:\COL\hebrew.pdf";
            string destinationFileName = @"M:\COL\hebrew1.pdf";


            // Part 1: copy the pages of `path` into a new A4 document written to `destinationFileName`.
            // NOTE(review): neither `reader` nor the FileStream created below is closed explicitly.
            PdfReader reader   = new PdfReader(path);
            int       n        = reader.NumberOfPages;
            Document  document = new Document(PageSize.A4);

            PdfWriter writer = PdfWriter.GetInstance(document, new FileStream(destinationFileName, FileMode.Create));

            int i = 0;

            document.Open();

            PdfContentByte cb = writer.DirectContent;


            // NOTE(review): `template` is never used after creation.
            PdfTemplate template = cb.CreateTemplate(0, 0);


            while (i < n)
            {
                document.NewPage();
                // Pages are 1-based, so the counter is advanced before it is used.
                i++;

                PdfImportedPage importedPage = writer.GetImportedPage(reader, i);


                // The imported page is added twice: once wrapped in an Image, once as a template.
                Image img = Image.GetInstance(importedPage);

                img.ScalePercent(100);
                document.Add(img);
                cb.AddTemplate(importedPage, 0, 100);
            }


            document.Close();
            writer.Close();


            // Part 2: tokenise every page of the input bytes.
            PdfReader pdfReader = new PdfReader(input);

            StringBuilder stringBuilder = new StringBuilder();

            // Accumulates string token values plus the characters read directly after each of them.
            string dingle = string.Empty;

            for (int page = 1; page <= pdfReader.NumberOfPages; page++)
            {
                stringBuilder.Append(pdfManager.ExtractText(pdfReader.GetPageContent(page)) + " ");

                PRTokeniser prTokeniser = new PRTokeniser(pdfReader.GetPageContent(page));


                // NOTE(review): `pdfDictionary` and `winsdgf` are computed but never read.
                PdfDictionary pdfDictionary = pdfReader.GetPageN(page);

                byte[] dinas = pdfReader.GetPageContent(page);

                string winsdgf = Encoding.GetEncoding(1255).GetString(dinas);


                try
                {
                    while (prTokeniser.NextToken())
                    {
                        if (prTokeniser.TokenType == PRTokeniser.TokType.STRING)
                        {
                            dingle += prTokeniser.StringValue;

                            try
                            {
                                //dingle += (char)(int.Parse(prTokeniser.StringValue));

                                //dingle += iTextSharp.text.Utilities.ConvertFromUtf32(prTokeniser.FilePointer);

                                //dingle += ((char)prTokeniser.Read()).ToString();

                                // NOTE(review): ReadString(2) presumably consumes input from the tokeniser,
                                // which would affect what the next NextToken() call sees - confirm.
                                dingle += prTokeniser.ReadString(2);
                                // NOTE(review): `chunk` is created and discarded.
                                Chunk chunk = new Chunk(prTokeniser.StringValue);

                                //string wangle = PRTokeniser.GetHex(prTokeniser.IntValue).ToString();
                            }
                            // NOTE(review): every exception is silently swallowed here.
                            catch (Exception)
                            {
                            }
                        }
                    }
                }
                // NOTE(review): tokenising errors are silently swallowed as well.
                catch (Exception)
                {
                    {
                    }
                    //throw;
                }

                //int ij = 0;

                // Reference VB implementation of the same token loop, kept for comparison:
//                #
//If Not IsNothing(pageBytes) Then
//#
//                    token = New PRTokeniser(pageBytes)
//#
//                    While token.NextToken()
//#
//                        tknType = token.TokenType()
//#
//                        tknValue = token.StringValue
//#
//                        If tknType = PRTokeniser.TK_STRING Then
//#
//                            sb.Append(token.StringValue)
//#
//                        'I need to add these additional tests to properly add whitespace to the output string
//#
//                        ElseIf tknType = 1 AndAlso tknValue = "-600" Then
//#
//                            sb.Append(" ")
//#
//                        ElseIf tknType = 10 AndAlso tknValue = "TJ" Then
//#
//                            sb.Append(" ")
//#
//                        End If
//#
//                   End While
            }

            // Part 3: run the method under test on the whole input; the result is not asserted.
            string actual = pdfManager.ExtractText(input);
        }
Пример #13
0
        /// <summary>
        /// Fills <c>Title</c> and <c>Text</c> of <paramref name="propertyBag"/> from a PDF response.
        /// </summary>
        /// <remarks>
        /// Nothing happens unless the status code is <c>OK</c> and <c>IsPdfContent</c> accepts the
        /// content type. The title comes from the <c>Title</c> entry of the document info and is only
        /// assigned when it is non-empty after trimming. The text is built from every page: each
        /// string token followed by a space, plus a space for every number token <c>-600</c> and every
        /// <c>TJ</c> command.
        /// </remarks>
        /// <param name="crawler">Crawler that produced the response; must not be null.</param>
        /// <param name="propertyBag">Response data to read from and write to; must not be null.</param>
        public void Process(Crawler crawler, PropertyBag propertyBag)
        {
            AspectF.Define.
            NotNull(crawler, "crawler").
            NotNull(propertyBag, "propertyBag");

            if (propertyBag.StatusCode != HttpStatusCode.OK)
            {
                return;
            }

            if (!IsPdfContent(propertyBag.ContentType))
            {
                return;
            }

            PdfReader pdfReader = new PdfReader(propertyBag.Response);

            try
            {
                object title = pdfReader.Info["Title"];
                if (!title.IsNull())
                {
                    string pdfTitle = Convert.ToString(title, CultureInfo.InvariantCulture).Trim();
                    if (!pdfTitle.IsNullOrEmpty())
                    {
                        propertyBag.Title = pdfTitle;
                    }
                }

                StringBuilder sb = new StringBuilder();
                // Following code from:
                // http://www.vbforums.com/showthread.php?t=475759
                for (int p = 1; p <= pdfReader.NumberOfPages; p++)
                {
                    byte[] pageBytes = pdfReader.GetPageContent(p);

                    if (pageBytes.IsNull())
                    {
                        continue;
                    }

                    PRTokeniser token = new PRTokeniser(pageBytes);

                    try
                    {
                        while (token.NextToken())
                        {
                            int    tknType  = token.TokenType;
                            string tknValue = token.StringValue;

                            if (tknType == PRTokeniser.TK_STRING)
                            {
                                sb.Append(tknValue);
                                sb.Append(" ");
                            }
                            else if ((tknType == PRTokeniser.TK_NUMBER && tknValue == "-600") ||
                                     (tknType == PRTokeniser.TK_OTHER && tknValue == "TJ"))
                            {
                                // These tokens are treated as word breaks, so the output gets whitespace.
                                sb.Append(" ");
                            }
                        }
                    }
                    finally
                    {
                        // The tokeniser was previously never closed.
                        token.Close();
                    }
                }

                propertyBag.Text = sb.ToString();
            }
            finally
            {
                pdfReader.Close();
            }
        }