Exemple #1
0
        /// <summary>
        /// Extracts audit-letter text from a PDF. Pages for which <c>PageTextLine</c> returns more
        /// than two lines are treated as text pages; all other pages are rendered to images and
        /// read through the image-based <c>Utilities</c> helpers. The two results are merged in
        /// the order given by whichever kind of page appears first in the document.
        /// </summary>
        /// <param name="pdfFile">Raw bytes of the PDF document.</param>
        /// <param name="extractedAuditLetterText">Receives the populated extraction result.</param>
        /// <param name="ts">Elapsed time written as the "Ocr" figure of the timing log; only used when the document has no image pages.</param>
        /// <param name="dt1">Start time from which the "Reader" figure of the timing log is computed.</param>
        /// <param name="num">Number prefixed to the timing log line.</param>
        /// <returns>The string produced by <c>SerializeExtractedAuditLetterText()</c> for the extraction result.</returns>
        public static string ExtractTextFromRawPdf(byte[] pdfFile, out ExtractedAuditLetterText extractedAuditLetterText, TimeSpan ts, DateTime dt1, int num)
        {
            List <MergedTraversedLine> txtFormattedLines = new List <MergedTraversedLine>();
            List <int> txtFormattedNum   = new List <int>();
            List <int> imageFormattedNum = new List <int>();

            pdfRead.pdfObject.PdfDocument doc = new pdfRead.pdfObject.PdfDocument(pdfFile);

            // NOTE(review): the last page is read here and again inside the loop below. The call
            // is kept because PageTextLine may update doc.lineFontSize, which is consumed later;
            // confirm whether it is actually needed.
            doc.PageTextLine(doc.pages.Count - 1);

            // Classify every page: more than two text lines => text page, otherwise image page.
            for (int pageIndex = 0; pageIndex < doc.pages.Count; pageIndex++)
            {
                List <PdfTextLine> textLines = doc.PageTextLine(pageIndex);
                if (textLines.Count > 2)
                {
                    txtFormattedNum.Add(pageIndex);
                }
                else
                {
                    imageFormattedNum.Add(pageIndex);
                }
            }

            ExtractedAuditLetterText extractedTexts      = new ExtractedAuditLetterText();
            ExtractedAuditLetterText extractedImageTexts = new ExtractedAuditLetterText();

            // Get text from text-formatted pages.
            if (txtFormattedNum.Count != 0)
            {
                // The document groups its lines by font size; flatten them and number them sequentially.
                Dictionary <double, List <PdfTextLine> > linesKeyValue = doc.lineFontSize;
                int index = 0;
                foreach (var fontGroup in linesKeyValue)
                {
                    foreach (var value in fontGroup.Value)
                    {
                        txtFormattedLines.Add(new MergedTraversedLine { Index = index, Text = value.text });
                        index++;
                    }
                }

                // NOTE(review): the result of this call was never used. The call itself is kept in
                // case the converter modifies the lines it is given; remove it if it is side-effect free.
                PdfExtractor.Utilities.ConvertToExtractedAuditLetterTexts(txtFormattedLines);

                // Text found inside images embedded in the document goes to Others.
                foreach (var img in doc.ExtractImages())
                {
                    foreach (var line in Utilities.ExtractLinesFromImage(img))
                    {
                        string other = string.Join(" ", line.Words.Select(r => r.Text));
                        if (other == "auren")
                        {
                            other = "Auren";
                        }
                        extractedTexts.Others.Add(other);
                    }
                }
            }

            // Get text from image-formatted pages.
            if (imageFormattedNum.Count != 0)
            {
                int titlePageStartIndexNum = 0;
                int titlePageEndIndexNum   = -1;

                List <Image> imagePdf = new List <Image>();
                try
                {
                    // Render the image pages, then release the document and its stream immediately.
                    using (MemoryStream pdfStream = new MemoryStream(pdfFile))
                    using (PdfiumViewer.PdfDocument document = PdfiumViewer.PdfDocument.Load(pdfStream))
                    {
                        foreach (int pageIndex in imageFormattedNum)
                        {
                            imagePdf.Add(document.Render(pageIndex, Consts.DpiX, Consts.DpiY, PdfRenderFlags.CorrectFromDpi));
                        }
                    }

                    List <List <Line> > linesOfImagePdf = new List <List <Line> >();
                    foreach (var img in imagePdf)
                    {
                        linesOfImagePdf.Add(Utilities.ExtractLinesFromImage(img));
                    }

                    extractedTexts.Others.AddRange(Utilities.GetRedundantLines(imagePdf, linesOfImagePdf));

                    for (int i = 0; i < imagePdf.Count; i++)
                    {
                        // Add small region text to ExtractedAuditLetterText.Others.
                        // This must run before GetContents, exactly as in the original ordering.
                        extractedTexts.Others.AddRange(Utilities.RemoveSmallRegion(imagePdf[i]));

                        // Get raw content of the main body.
                        List <TraversedLine> rawImageTexts = PdfExtractor.Utilities.GetContents(imagePdf[i]);

                        // On lines mentioning "sha-" or "thumb", map letters that resemble digits
                        // (O/o -> 0, i/I/l -> 1).
                        foreach (var line in rawImageTexts)
                        {
                            if (line.Text.IndexOf("sha-", StringComparison.OrdinalIgnoreCase) >= 0 || (line.Text.IndexOf("thumb", StringComparison.OrdinalIgnoreCase) >= 0))
                            {
                                line.Text = line.Text.Replace("O", "0").Replace("o", "0").Replace("i", "1").Replace("I", "1").Replace("l", "1");
                            }
                        }

                        // Merge raw content by paragraph.
                        List <MergedTraversedLine> mergedImageTexts = PdfExtractor.Utilities.MergeTraversedLines(rawImageTexts);

                        titlePageEndIndexNum += mergedImageTexts.Count;
                        List <MergedTraversedLine> titleLines = mergedImageTexts.Where(x => x.IsTitle == true).ToList <MergedTraversedLine>();

                        // If this page contains titles, record the page's line range in TitlePageRanges
                        // and each title's index (offset by the lines of the preceding pages) in TitleRanges.
                        if (titleLines.Count > 0)
                        {
                            extractedImageTexts.TitlePageRanges.Add(new KeyValuePair <int, int>(titlePageStartIndexNum, titlePageEndIndexNum));
                            foreach (MergedTraversedLine title in titleLines)
                            {
                                extractedImageTexts.TitleRanges.Add(new KeyValuePair <int, int>(title.Index + titlePageStartIndexNum, title.Index + titlePageStartIndexNum));
                            }
                        }
                        titlePageStartIndexNum += mergedImageTexts.Count;

                        // Add titles and contents of this page.
                        ExtractedAuditLetterText pageTexts = PdfExtractor.Utilities.ConvertToExtractedAuditLetterTexts(mergedImageTexts);
                        extractedImageTexts.Titles.AddRange(pageTexts.Titles);
                        extractedImageTexts.Contents.AddRange(pageTexts.Contents);
                    }
                }
                finally
                {
                    // The rendered pages are only needed while extracting; release them afterwards.
                    foreach (Image renderedPage in imagePdf)
                    {
                        renderedPage.Dispose();
                    }
                }
            }

            if (imageFormattedNum.Count == 0)
            {
                // Text-only document: this path additionally appends a line to the timing log.
                AppendTextFormattedResult(extractedTexts, txtFormattedLines);

                StringBuilder timing = new StringBuilder();
                timing.Append(num + "Ocr:     " + ts.TotalMilliseconds.ToString() + "  ");

                string serializedTextOnly = extractedTexts.SerializeExtractedAuditLetterText();
                extractedAuditLetterText = extractedTexts;
                ts = System.DateTime.Now.Subtract(dt1);
                timing.Append("   Reader:     " + ts.TotalMilliseconds.ToString() + "\n");

                // The timing log is a debugging aid written to a fixed location; a missing or
                // unwritable location must not prevent the extracted text from being returned.
                string fileTime = @"C:\Users\t-holu\Documents\AuditLetter\JixiProjectData\comTextTime.txt";
                try
                {
                    File.AppendAllText(fileTime, timing.ToString());
                }
                catch (IOException ex)
                {
                    System.Diagnostics.Debug.WriteLine("Timing log not written: " + ex.Message);
                }
                catch (UnauthorizedAccessException ex)
                {
                    System.Diagnostics.Debug.WriteLine("Timing log not written: " + ex.Message);
                }
                return(serializedTextOnly);
            }

            // NOTE(review): extractedImageTexts.TitlePageRanges is filled above but is not copied
            // into the returned result in any of the branches below - confirm whether that is intended.
            if (txtFormattedNum.Count == 0)
            {
                AppendImageFormattedResult(extractedTexts, extractedImageTexts);
            }
            else if (txtFormattedNum[0] < imageFormattedNum[0])
            {
                // The document starts with a text page: text results first.
                AppendTextFormattedResult(extractedTexts, txtFormattedLines);
                AppendImageFormattedResult(extractedTexts, extractedImageTexts);
            }
            else
            {
                // The document starts with an image page: image results first.
                AppendImageFormattedResult(extractedTexts, extractedImageTexts);
                AppendTextFormattedResult(extractedTexts, txtFormattedLines);
            }

            string serialzedAuditLetterText = extractedTexts.SerializeExtractedAuditLetterText();

            extractedAuditLetterText = extractedTexts;

            return(serialzedAuditLetterText);
        }

        /// <summary>
        /// Converts the text-page lines and appends the resulting titles and contents to <paramref name="target"/>.
        /// </summary>
        private static void AppendTextFormattedResult(ExtractedAuditLetterText target, List <MergedTraversedLine> txtFormattedLines)
        {
            ExtractedAuditLetterText converted = PdfExtractor.Utilities.ConvertToExtractedAuditLetterTexts(txtFormattedLines);
            target.Titles.AddRange(converted.Titles);
            target.Contents.AddRange(converted.Contents);
        }

        /// <summary>
        /// Appends the titles, contents and title ranges collected from image pages to <paramref name="target"/>.
        /// </summary>
        private static void AppendImageFormattedResult(ExtractedAuditLetterText target, ExtractedAuditLetterText imageTexts)
        {
            target.Titles.AddRange(imageTexts.Titles);
            target.Contents.AddRange(imageTexts.Contents);
            target.TitleRanges.AddRange(imageTexts.TitleRanges);
        }
Exemple #2
0
        /// <summary>
        /// Manual test harness: loads a sample PDF from a fixed folder, runs
        /// <c>ExtractTextFromRawPdf</c> on it once and writes the serialized result to a text file.
        /// </summary>
        /// <param name="args">Command-line arguments (unused).</param>
        static void Main(string[] args)
        {
            string fileFold  = @"C:\Users\t-holu\Documents\AuditLetter\JixiProjectData\";
            string filesharp = @"C:\Users\t-holu\Documents\AuditLetter\JixiProjectData\sharp.txt";

            for (int i = 130; i < 131; i++)
            {
                // NOTE(review): the input is always "50.pdf"; the loop index is only passed on
                // as the label of the timing line - confirm this is intended.
                byte[] filebytes = FileToByteArray(fileFold + 50 + ".pdf");

                // The OCR step that used to be timed here is disabled, so this span is close to
                // zero; it is still passed on because ExtractTextFromRawPdf takes it as a parameter.
                DateTime dt1 = System.DateTime.Now;
                TimeSpan ts  = System.DateTime.Now.Subtract(dt1);

                dt1 = System.DateTime.Now;

                ExtractedAuditLetterText extractedAuditLetterText;
                string serialized;
                try
                {
                    // Run the extraction exactly once; a failure is reported instead of being
                    // swallowed and then retried without protection.
                    serialized = ExtractTextFromRawPdf(filebytes, out extractedAuditLetterText, ts, dt1, i);
                }
                catch (Exception e)
                {
                    Console.Error.WriteLine("Extraction failed: " + e);
                    continue;
                }

                File.WriteAllText(filesharp, serialized);
                Console.WriteLine("finish");
            }

            // Keep the console window open until a key is pressed.
            Console.ReadKey();
        }