Example #1
0
        /// <summary>
        /// Exploratory test around <c>PDFManager.ExtractText</c>.
        /// </summary>
        /// <remarks>
        /// The method does three things in order:
        /// 1. Copies every page of <c>M:\COL\hebrew.pdf</c> into a new A4 document written to
        ///    <c>M:\COL\hebrew1.pdf</c>; each imported page is added both as an image and as a
        ///    direct-content template (<c>AddTemplate(importedPage, 0, 100)</c>).
        /// 2. Opens a second PDF from the <c>input</c> bytes and, page by page, appends the
        ///    text returned by <c>PDFManager.ExtractText</c> and collects the raw string tokens
        ///    found in the page content.
        /// 3. Calls <c>PDFManager.ExtractText</c> on the whole <c>input</c> buffer.
        /// NOTE(review): nothing is asserted, so the method can only fail by throwing.
        /// </remarks>
        public void ExtractTextTest1()
        {
            PDFManager pdfManager = new PDFManager(); // TODO: Initialize to an appropriate value

            // Alternative input source, kept for reference:
            //byte[] input = File.ReadAllBytes(DiscoveryManager.GetDiscoveryPath("M:\\DFD", "http://unicode.org/charts/PDF/U0590.pdf", ".pdf"));

            // NOTE(review): the path literal is empty and File.ReadAllBytes rejects an empty path
            // with an ArgumentException, so nothing below runs until a real PDF path is supplied.
            byte[] input = File.ReadAllBytes(@"");

            string path = @"M:\COL\hebrew.pdf";
            string destinationFileName = @"M:\COL\hebrew1.pdf";

            PdfReader reader = new PdfReader(path);
            try
            {
                int n = reader.NumberOfPages;
                Document document = new Document(PageSize.A4);

                // The using block releases the output file handle even if importing a page throws
                // before document.Close() is reached.
                using (FileStream output = new FileStream(destinationFileName, FileMode.Create))
                {
                    PdfWriter writer = PdfWriter.GetInstance(document, output);

                    document.Open();

                    PdfContentByte cb = writer.DirectContent;

                    // NOTE(review): the created template is never used; the call is kept because its
                    // effect on the writer is not visible from here - confirm before removing it.
                    cb.CreateTemplate(0, 0);

                    // Page numbers passed to GetImportedPage run from 1 to n.
                    for (int pageNumber = 1; pageNumber <= n; pageNumber++)
                    {
                        document.NewPage();

                        PdfImportedPage importedPage = writer.GetImportedPage(reader, pageNumber);

                        Image img = Image.GetInstance(importedPage);
                        img.ScalePercent(100);
                        document.Add(img);

                        cb.AddTemplate(importedPage, 0, 100);
                    }

                    document.Close();
                    writer.Close();
                }
            }
            finally
            {
                reader.Close();
            }

            PdfReader pdfReader = new PdfReader(input);
            try
            {
                StringBuilder stringBuilder = new StringBuilder();

                // Raw string tokens gathered from every page; only useful for inspection in a debugger.
                StringBuilder tokenText = new StringBuilder();

                for (int page = 1; page <= pdfReader.NumberOfPages; page++)
                {
                    stringBuilder.Append(pdfManager.ExtractText(pdfReader.GetPageContent(page))).Append(" ");

                    PRTokeniser prTokeniser = new PRTokeniser(pdfReader.GetPageContent(page));

                    try
                    {
                        while (prTokeniser.NextToken())
                        {
                            if (prTokeniser.TokenType != PRTokeniser.TokType.STRING)
                            {
                                continue;
                            }

                            tokenText.Append(prTokeniser.StringValue);

                            try
                            {
                                // NOTE(review): ReadString(2) advances the tokeniser after every string
                                // token, which presumably skews the tokens that follow - confirm intent.
                                tokenText.Append(prTokeniser.ReadString(2));
                            }
                            catch (Exception ex)
                            {
                                // Best effort: keep scanning the page, but leave a trace of the failure.
                                System.Diagnostics.Trace.WriteLine("ExtractTextTest1: ReadString failed on page " + page + ": " + ex);
                            }
                        }
                    }
                    catch (Exception ex)
                    {
                        // Best effort: a page that cannot be tokenised is skipped, not fatal.
                        System.Diagnostics.Trace.WriteLine("ExtractTextTest1: tokenising page " + page + " failed: " + ex);
                    }

                    // An earlier VB sketch of this loop also appended a space for a token of type 1
                    // with value "-600" and for a token of type 10 with value "TJ", in order to
                    // restore whitespace in the extracted text.
                }
            }
            finally
            {
                pdfReader.Close();
            }

            string actual = pdfManager.ExtractText(input);
        }