/// <summary>
/// Exploratory test: copies every page of a sample Hebrew PDF into a new A4 document, then runs
/// PDFManager.ExtractText and an iTextSharp PRTokeniser over the page content streams of the
/// input bytes, and finally runs PDFManager.ExtractText over the whole input.
/// </summary>
/// <remarks>
/// NOTE(review): the test depends on fixed files under M:\COL, loads its input from an empty
/// path, and contains no assertions, so it cannot currently verify anything. See the inline
/// notes below.
/// </remarks>
public void ExtractTextTest1()
{
    PDFManager pdfManager = new PDFManager();

    // NOTE(review): File.ReadAllBytes rejects an empty path with an ArgumentException, so as
    // written the test stops on this line. An earlier draft fetched
    // http://unicode.org/charts/PDF/U0590.pdf through DiscoveryManager.GetDiscoveryPath;
    // point this at a real fixture before relying on the test.
    byte[] input = File.ReadAllBytes(@"");

    string sourcePath = @"M:\COL\hebrew.pdf";
    string destinationFileName = @"M:\COL\hebrew1.pdf";

    // --- Part 1: re-emit each source page into a fresh A4 document. ---
    PdfReader reader = new PdfReader(sourcePath);
    try
    {
        int pageCount = reader.NumberOfPages;

        // The using block releases the output file handle even if the copy fails half-way.
        using (FileStream output = new FileStream(destinationFileName, FileMode.Create))
        {
            Document document = new Document(PageSize.A4);
            PdfWriter writer = PdfWriter.GetInstance(document, output);
            document.Open();

            PdfContentByte cb = writer.DirectContent;

            // The returned template was never used. The call is kept in case it registers
            // state on the writer - TODO confirm it can be dropped.
            cb.CreateTemplate(0, 0);

            // PDF page numbers are 1-based.
            for (int pageNumber = 1; pageNumber <= pageCount; pageNumber++)
            {
                document.NewPage();

                PdfImportedPage importedPage = writer.GetImportedPage(reader, pageNumber);

                // Each imported page is placed twice: once as an image in the document flow
                // and once as a template drawn directly on the canvas at offset (0, 100).
                Image img = Image.GetInstance(importedPage);
                img.ScalePercent(100);
                document.Add(img);
                cb.AddTemplate(importedPage, 0, 100);
            }

            document.Close();
            writer.Close();
        }
    }
    finally
    {
        // The reader must outlive the writer (imported pages are read from it), so it is
        // closed only after the document has been written or the copy has failed.
        reader.Close();
    }

    // --- Part 2: extract text from the input bytes, page by page. ---
    PdfReader pdfReader = new PdfReader(input);
    try
    {
        StringBuilder extractedText = new StringBuilder();
        StringBuilder tokenText = new StringBuilder();

        for (int page = 1; page <= pdfReader.NumberOfPages; page++)
        {
            // Fetch the content stream once per page and share it between the extractor
            // and the tokeniser.
            // NOTE(review): assumes PDFManager.ExtractText does not modify the array - confirm.
            byte[] pageContent = pdfReader.GetPageContent(page);

            extractedText.Append(pdfManager.ExtractText(pageContent)).Append(" ");

            PRTokeniser prTokeniser = new PRTokeniser(pageContent);
            try
            {
                while (prTokeniser.NextToken())
                {
                    if (prTokeniser.TokenType != PRTokeniser.TokType.STRING)
                    {
                        continue;
                    }

                    tokenText.Append(prTokeniser.StringValue);
                    try
                    {
                        // Also pulls the two characters that follow the string token.
                        tokenText.Append(prTokeniser.ReadString(2));
                    }
                    catch (Exception ex)
                    {
                        // Best effort: report the failure and keep tokenising.
                        Console.WriteLine("ExtractTextTest1: ReadString failed on page " + page + ": " + ex.Message);
                    }
                }
            }
            catch (Exception ex)
            {
                // Best effort: a page that cannot be tokenised is reported and skipped.
                Console.WriteLine("ExtractTextTest1: tokenising page " + page + " failed: " + ex.Message);
            }

            // An earlier VB sketch additionally appended a space after a numeric token with
            // value "-600" and after the "TJ" operator to restore whitespace in the output.
        }
    }
    finally
    {
        pdfReader.Close();
    }

    // NOTE(review): the result is never asserted against an expected value.
    string actual = pdfManager.ExtractText(input);
}