Example #1
0
        /// <summary>
        /// Appends the value of every string token found in the content stream(s)
        /// designated by <paramref name="content"/> to <paramref name="sb"/>.
        /// </summary>
        /// <param name="reader">Reader used to resolve indirect references.</param>
        /// <param name="sb">Builder that receives the extracted string tokens.</param>
        /// <param name="content">
        /// A page's /Contents entry. It may be null (page without content), an indirect
        /// reference, a direct object, or an array of streams; anything that does not
        /// lead to a stream is skipped.
        /// </param>
        private static void ExtractLines(PdfReader reader, StringBuilder sb, PdfObject content)
        {
            // A page without a /Contents entry has nothing to extract.
            if (content == null)
            {
                return;
            }

            // Only dereference when the entry really is an indirect reference;
            // the previous unconditional cast threw for arrays and direct objects.
            var ir    = content as PRIndirectReference;
            var value = ir != null ? reader.GetPdfObject(ir.Number) : content;

            if (value == null)
            {
                return;
            }

            // /Contents may be an array whose streams together form the page content.
            if (value.IsArray())
            {
                foreach (PdfObject part in ((PdfArray)value).ArrayList)
                {
                    ExtractLines(reader, sb, part);
                }
                return;
            }

            if (!value.IsStream())
            {
                return;
            }

            var streamBytes = PdfReader.GetStreamBytes((PRStream)value);
            var tokenizer   = new PRTokeniser(new RandomAccessFileOrArray(streamBytes));

            try
            {
                while (tokenizer.NextToken())
                {
                    if (tokenizer.TokenType == PRTokeniser.TK_STRING)
                    {
                        sb.Append(tokenizer.StringValue);
                    }
                }
            }
            finally
            {
                // Release the tokenizer even when tokenizing fails.
                tokenizer.Close();
            }
        }
Example #2
0
        /// <summary>
        /// Legacy extraction routine written against iTextSharp 4.1.6. Prefer iTextSharp version >= 5 where possible (note that license changes were made).
        /// </summary>
        /// <param name="input">Bytes to run through the tokeniser.</param>
        /// <returns>
        /// The string tokens concatenated in order, with a single space added for every
        /// non-string token whose value is "-600" or "TJ"; an empty string when
        /// <paramref name="input"/> is null or empty.
        /// </returns>
        internal static string ExtractTextFromPdfBytes(byte[] input)
        {
            bool hasData = input != null && input.Length > 0;
            if (!hasData)
            {
                return "";
            }

            var text   = new StringBuilder();
            var parser = new PRTokeniser(input);

            try
            {
                while (parser.NextToken())
                {
                    bool   isString = parser.TokenType == PRTokeniser.TK_STRING;
                    string token    = parser.StringValue.Replace('\0', ' ');

                    if (isString)
                    {
                        text.Append(token);
                        continue;
                    }

                    // Both of these non-string tokens are rendered as a word gap.
                    if (token == "-600" || token == "TJ")
                    {
                        text.Append(" ");
                    }
                }
            }
            finally
            {
                parser.Close();
            }

            return text.ToString();
        }
Example #3
0
        /// <summary>
        /// Reads a PDF document from <paramref name="stream"/>, collects every string token
        /// found in the page contents (one per line) and writes the result to the console.
        /// </summary>
        /// <param name="stream">Stream holding the PDF document.</param>
        /// <returns>
        /// The table list created by this method; nothing in this method adds to it,
        /// so it is returned empty.
        /// </returns>
        public List <DataTable> Load(MemoryStream stream)
        {
            var tables = new List <DataTable>();
            var sb     = new StringBuilder();
            var reader = new PdfReader(stream);

            try
            {
                for (int page = 1; page <= reader.NumberOfPages; page++)
                {
                    // GetPageContent copes with /Contents being absent, a single stream
                    // or an array of streams; the previous hand-rolled cast to
                    // PRIndirectReference threw for anything but an indirect reference.
                    var pageBytes = reader.GetPageContent(page);
                    var tokenizer = new PRTokeniser(new RandomAccessFileOrArray(pageBytes));

                    try
                    {
                        while (tokenizer.NextToken())
                        {
                            if (tokenizer.TokenType == PRTokeniser.TK_STRING)
                            {
                                sb.AppendLine(tokenizer.StringValue);
                            }
                        }
                    }
                    finally
                    {
                        tokenizer.Close();
                    }
                }
            }
            finally
            {
                // The reader was previously never closed.
                reader.Close();
            }

            Console.WriteLine(sb.ToString());
            return(tables);
        }
        /// <summary>
        /// Opens the PDF at a fixed path, concatenates every string token found in the
        /// page contents and prints the result to the console between a header and an
        /// "--EOF--" marker.
        /// </summary>
        /// <param name="args">Command-line arguments; not used.</param>
        static void Main(string[] args)
        {
            string        pdfPath = "C:\\mypdf.pdf";
            PdfReader     reader  = new PdfReader(pdfPath);
            StringBuilder sb      = new StringBuilder();

            try
            {
                for (int page = 1; page <= reader.NumberOfPages; page++)
                {
                    // GetPageContent copes with /Contents being absent, a single stream
                    // or an array of streams; the previous hand-rolled cast to
                    // PRIndirectReference threw for anything but an indirect reference.
                    var pageBytes = reader.GetPageContent(page);
                    var tokenizer = new PRTokeniser(new RandomAccessFileOrArray(pageBytes));

                    try
                    {
                        while (tokenizer.NextToken())
                        {
                            if (tokenizer.TokenType == PRTokeniser.TK_STRING)
                            {
                                sb.Append(tokenizer.StringValue);
                            }
                        }
                    }
                    finally
                    {
                        tokenizer.Close();
                    }
                }
            }
            finally
            {
                // Release the underlying file; the reader was previously never closed.
                reader.Close();
            }

            Console.Write("PDF Content:" + Environment.NewLine);
            Console.Write(sb.ToString());
            Console.Write(Environment.NewLine + "--EOF--");
        }
Example #5
0
        /// <summary>
        /// Parses the CMap named <paramref name="cmapName"/> and feeds its header values,
        /// character mappings and range mappings into <paramref name="cmap"/>.
        /// A referenced CMap (the USECMAP operator) is parsed recursively into the same
        /// <paramref name="cmap"/>.
        /// </summary>
        /// <param name="cmapName">Name handed to <paramref name="location"/> to obtain the tokeniser.</param>
        /// <param name="cmap">Receiver of the parsed registry, ordering, name, supplement, chars and ranges.</param>
        /// <param name="location">Provider of the tokeniser for a given CMap name.</param>
        /// <param name="level">Current recursion depth; nothing is parsed once it reaches MAXLEVEL.</param>
        private static void ParseCid(String cmapName, AbstractCMap cmap, ICidLocation location, int level)
        {
            // Recursion guard for chains of USECMAP references.
            if (level >= MAXLEVEL)
            {
                return;
            }
            PRTokeniser inp = location.GetLocation(cmapName);

            try {
                List <PdfObject> list = new List <PdfObject>();
                PdfContentParser cp   = new PdfContentParser(inp);
                // Budget of parse failures: 50 are tolerated, the 51st ends the loop.
                int maxExc            = 50;
                while (true)
                {
                    try {
                        cp.Parse(list);
                    }
                    catch {
                        if (--maxExc < 0)
                        {
                            break;
                        }
                        // Skip the failed chunk and try parsing the next one.
                        continue;
                    }
                    // An empty result ends parsing (presumably end of input - confirm against PdfContentParser).
                    if (list.Count == 0)
                    {
                        break;
                    }
                    // The last parsed element is treated as the operator; the preceding ones are its operands.
                    String last = list[list.Count - 1].ToString();
                    // Header entries of the form "key value DEF" are honoured only for the top-level CMap.
                    if (level == 0 && list.Count == 3 && last.Equals(DEF))
                    {
                        PdfObject key = list[0];
                        if (PdfName.REGISTRY.Equals(key))
                        {
                            cmap.Registry = list[1].ToString();
                        }
                        else if (PdfName.ORDERING.Equals(key))
                        {
                            cmap.Ordering = list[1].ToString();
                        }
                        else if (CMAPNAME.Equals(key))
                        {
                            cmap.Name = list[1].ToString();
                        }
                        else if (PdfName.SUPPLEMENT.Equals(key))
                        {
                            // Best effort: a supplement value that is not a PdfNumber is silently ignored.
                            try {
                                cmap.Supplement = ((PdfNumber)list[1]).IntValue;
                            }
                            catch {}
                        }
                    }
                    else if ((last.Equals(ENDCIDCHAR) || last.Equals(ENDBFCHAR)) && list.Count >= 3)
                    {
                        // Operands come in (source, target) pairs; the bound keeps k + 1 ahead of the trailing operator.
                        int lmax = list.Count - 2;
                        for (int k = 0; k < lmax; k += 2)
                        {
                            if (list[k] is PdfString)
                            {
                                cmap.AddChar((PdfString)list[k], list[k + 1]);
                            }
                        }
                    }
                    else if ((last.Equals(ENDCIDRANGE) || last.Equals(ENDBFRANGE)) && list.Count >= 4)
                    {
                        // Operands come in (from, to, target) triples; the bound keeps k + 2 ahead of the trailing operator.
                        int lmax = list.Count - 3;
                        for (int k = 0; k < lmax; k += 3)
                        {
                            if (list[k] is PdfString && list[k + 1] is PdfString)
                            {
                                cmap.AddRange((PdfString)list[k], (PdfString)list[k + 1], list[k + 2]);
                            }
                        }
                    }
                    else if (last.Equals(USECMAP) && list.Count == 2 && list[0] is PdfName)
                    {
                        // Pull in the referenced CMap, one level deeper.
                        ParseCid(PdfName.DecodeName(list[0].ToString()), cmap, location, level + 1);
                    }
                }
            }
            finally {
                // Always release the tokeniser obtained from the location.
                inp.Close();
            }
        }