Beispiel #1
0
        /// <summary>
        /// Builds an iCalendar (VCALENDAR) text document from raw schedule input.
        /// </summary>
        /// <param name="rawInput">Raw text that is validated and then handed to <c>PdfParser.GetDaysList</c>.</param>
        /// <returns>The calendar text containing one VEVENT section per lesson.</returns>
        public static string GenerateCalendar(string rawInput)
        {
            ValidateInput(rawInput);

            var daysList = PdfParser.GetDaysList(rawInput);

            // Lazy query: every lesson text of every day becomes one calendar event.
            var lessonEvents =
                from day in daysList
                from text in day.GetLessonTexts()
                select text.ToCalendarEvent();

            var output = new StringBuilder();

            // Calendar header.
            output.AppendLine("BEGIN:VCALENDAR");
            output.AppendLine("VERSION:2.0");
            output.AppendLine("PRODID:Schedule_generated_with_itext7");
            output.AppendLine("CALSCALE:GREGORIAN");

            // Materialize all events first, then emit one VEVENT per event.
            foreach (var ev in lessonEvents.ToList())
            {
                output.AppendLine("BEGIN:VEVENT");
                output.AppendLine("DTSTAMP:" + FormatDateTime(ev.TimeStamp));
                output.AppendLine("DTSTART:" + FormatDateTime(ev.Start));
                output.AppendLine("DTEND:" + FormatDateTime(ev.End));
                output.AppendLine("SUMMARY:" + ev.Summary);
                output.AppendLine("DESCRIPTION:" + ev.Description);
                output.AppendLine("LOCATION:" + ev.Location);
                output.AppendLine("UID:" + ev.Uid);
                output.AppendLine("END:VEVENT");
            }

            output.AppendLine("END:VCALENDAR");

            return output.ToString();
        }
        /// <summary>
        /// Reads the textual content of a ".pdf" or ".txt" file.
        /// </summary>
        /// <param name="file">File descriptor; the extension of its <c>FileName</c> selects the reader.</param>
        /// <returns>
        /// The extracted text, or an empty string when the extension is not handled, no content
        /// is available, or reading fails (the failure is logged as a warning and swallowed).
        /// </returns>
        internal static string GetFileContent(IFileInfo file)
        {
            try
            {
                string extension = Path.GetExtension(file.FileName);

                // Ordinal, case-insensitive comparison so that e.g. "REPORT.PDF" is not silently skipped.
                bool isPdf  = string.Equals(extension, ".pdf", StringComparison.OrdinalIgnoreCase);
                bool isText = string.Equals(extension, ".txt", StringComparison.OrdinalIgnoreCase);

                if (!isPdf && !isText)
                {
                    return("");
                }

                var fileContent = FileManager.Instance.GetFileContent(file);
                if (fileContent == null)
                {
                    return("");
                }

                if (isPdf)
                {
                    // Release the content stream once the PDF text has been extracted.
                    using (fileContent)
                    {
                        return(PdfParser.ReadPdfFile(fileContent));
                    }
                }

                // Plain text: the reader takes over the stream and closes it when disposed.
                using (var reader = new StreamReader(fileContent, Encoding.UTF8))
                {
                    return(reader.ReadToEnd());
                }
            }
            catch (Exception ex)
            {
                // A null descriptor also ends up here (via the dereference above); report it explicitly.
                string filename = file == null ? "unknown filename. IFileInfo is null." : file.FileName;
                Log.Logger.WarnFormat("Ignoring file [{0}]. Failed reading content. Error: {1}", filename, ex.Message);
            }
            return("");
        }
Beispiel #3
0
        /// <summary>
        /// Creates a parser for the given PDF file and shows the file content in the text box.
        /// </summary>
        /// <param name="filePath">Path handed to the <c>PdfParser</c> constructor.</param>
        public void HandleOpenPdf(string filePath)
        {
            // The parser is kept in the member so it stays available after this call.
            parser = new PdfParser(filePath);

            tbFileContent.Text = parser.GetFileContent();
        }
Beispiel #4
0
        /// <summary>
        /// Reads the string found inside a fixed rectangular region of the given PDF file.
        /// </summary>
        /// <param name="file">File argument forwarded to <c>GetStringValueFromRegion</c>.</param>
        /// <returns>The value returned by the parser for that region.</returns>
        private static string GetToolCode(string file)
        {
            IPdfParser regionReader = new PdfParser();

            // Hard-coded region; presumably the spot where the tool code is printed — confirm against the form layout.
            var toolCodeRegion = new iTextSharp.text.Rectangle(296, 722, 323, 728);

            return regionReader.GetStringValueFromRegion(file, toolCodeRegion);
        }
Beispiel #5
0
        /// <summary>
        /// Parsing the example raw input must yield the example days list.
        /// </summary>
        public void GetDaysListTest()
        {
            var expectedDays = _exampleDaysList;

            var parsedDays = PdfParser.GetDaysList(ExampleRawInput);

            Assert.AreEqual(expectedDays, parsedDays);
        }
Beispiel #6
0
        /// <summary>
        /// The date header at the very start of a day item must be extracted as-is.
        /// </summary>
        public void ExtractDateFromDayStringItemSuccessTest()
        {
            const string dayItem =
                "Data Zajęć: 2019-10-04 piątek 17:30 19:00 2h00m dr Name Surname Advanced Math Wyk W/1/Web F Toronto Egzamin 19:15 20:45 2h00m doc. dr John Smiths Modern History of Poland Wyk W/1/Web F Praga Egzamin";

            var extractedDate = PdfParser.ExtractDateFromDayStringItem(dayItem);

            Assert.AreEqual("Data Zajęć: 2019-10-04 piątek", extractedDate);
        }
        /// <summary>
        /// The text extracted from the sample PDF must match the known content.
        /// </summary>
        public void GetText()
        {
            var pdfParser = new PdfParser();

            var text = pdfParser.GetText("PdfFiles/1.pdf");

            // Expected value goes first so a failure message reports expected/actual the right way round.
            Assert.AreEqual("atish kumar dipongkor", text);
        }
Beispiel #8
0
        /// <summary>
        /// Raw text separated by spaces and a line break must be split into its individual words.
        /// </summary>
        public void RawTextToWordsTest()
        {
            string[] words = { "Some", "example", "input", "text" };

            // Two words per line, the lines separated by a single "\n".
            var rawText = $"{words[0]} {words[1]}\n{words[2]} {words[3]}";

            var actualWords = PdfParser.RawTextToWords(rawText);

            Assert.AreEqual(words, actualWords);
        }
Beispiel #9
0
        /// <summary>
        /// A day item with leading whitespace before the date header must be rejected
        /// with the "index of matched item not zero" parsing error.
        /// </summary>
        public void ExtractDateFromStringItemNotSuccessfulTest2()
        {
            const string paddedDayItem =
                "    Data Zajęć: 2019-10-04 piątek 17:30 19:00 2h00m dr Name Surname Advanced Math Wyk W/1/Web F Toronto Egzamin 19:15 20:45 2h00m doc. dr John Smiths Modern History of Poland Wyk W/1/Web F Praga Egzamin";

            var thrown = Assert.Throws <ParsingException>(
                () => PdfParser.ExtractDateFromDayStringItem(paddedDayItem));

            Assert.That(thrown.Message, Is.EqualTo(ParsingException.IndexOfMatchedItemNotZero));
        }
Beispiel #10
0
        /// <summary>
        /// The age distribution parsed from the given resource must have the expected keys
        /// and the expected value for every key.
        /// </summary>
        /// <param name="file">Resource name used both to load the PDF text and to look up the expected values.</param>
        public void Parse_age_distribution_correctly(string file)
        {
            var pdfText            = LoadResource(file);
            var expectedDictionary = GetExpectedAgeDistributionForFile(file);
            var distribution       = PdfParser.ParseDistributionByAge(pdfText);

            // Shouldly reads as actual.ShouldBe(expected); the parsed result is the actual value.
            distribution.Keys.ShouldBe(expectedDictionary.Keys);

            foreach (var key in expectedDictionary.Keys)
            {
                distribution[key].ShouldBe(expectedDictionary[key], $"{key} does not match");
            }
        }
        /// <summary>
        /// Returns the field information of the PDF form "&lt;formid&gt;.pdf" located under <c>pdfroot</c>.
        /// </summary>
        /// <param name="formid">Form identifier; used as the file name without extension.</param>
        /// <returns>The result of <c>GetFieldsInfo</c>, or <c>null</c> when the file does not exist.</returns>
        public List <Dictionary <string, object> > GetAllForminfo(string formid)
        {
            var formPath = Path.Combine(pdfroot, formid + ".pdf");

            if (!System.IO.File.Exists(formPath))
            {
                return null;
            }

            return new PdfParser(formPath).GetFieldsInfo();
        }
        /// <summary>
        /// Returns the page number reported by the parser for the PDF form "&lt;formid&gt;.pdf" under <c>pdfroot</c>.
        /// </summary>
        /// <param name="formid">Form identifier; used as the file name without extension.</param>
        /// <returns>The value of <c>GetPageNumber</c>, or <c>-1</c> when the file does not exist.</returns>
        public int GetAllForms(string formid)
        {
            var formPath = Path.Combine(pdfroot, formid + ".pdf");

            return System.IO.File.Exists(formPath)
                ? new PdfParser(formPath).GetPageNumber()
                : -1;
        }
Beispiel #13
0
        /// <summary>
        /// Converting the four example day strings must produce the example days list, in order.
        /// </summary>
        public void DayStringsToDayItemsTest()
        {
            var dayStrings = new List <string>();
            dayStrings.Add("Data Zajęć: 2019-10-04 piątek 17:30 19:00 2h00m dr Name Surname Advanced Math Wyk W/1/Web F Toronto Egzamin 19:15 20:45 2h00m doc. dr John Smiths Modern History of Poland Wyk W/1/Web F Praga Egzamin");
            dayStrings.Add("Data Zajęć: 2019-10-05 sobota 8:00 9:30 2h00m mgr Marry Smiths-Blue Chemistry Cw 12 Blue 2/IEN F Sztokholm Zaliczenie ocena 11:20 14:30 4h00m doc. dr John Black Physics Wyk W/2/W F Toronto Egzamin");
            dayStrings.Add("Data Zajęć: 2019-10-06 niedziela 8:00 9:30 2h00m mgr Jack Green Wzorce projektowe Lab lab/WebN F Montreal Zaliczenie ocena 9:40 11:10 2h00m mgr inż. Thomas Orange Biology and Geography Lab lab15/2/WebN F Los Angeles Zaliczenie ocena 14:40 16:10 2h00m dr George White Geology Lab lab51/22/WebN F San Francisco Zaliczenie ocena 16:20 17:50 2h00m mgr Richard White Basic Mathematics Lab lab22/22/WebN F Chicago Zaliczenie ocena 18:00 19:30 2h00m mgr David Smith Civil Engineering Lab lab3/3/WebN F Nowy Jork Zaliczenie ocena 19:40 21:10 2h00m mgr Jacob Brown Advanced Mathematics Konw konw/2/WebN F Toronto Zaliczenie ocena");
            dayStrings.Add("Data Zajęć: 2019-10-18 piątek 17:30 19:00 2h00m dr Jacob Brown Advanced Physics Wyk W/2/WebN F Toronto Egzamin 19:15 20:45 2h00m dr inż. Thomas Blue Geology Konw konw/2/WebN F Praga Zaliczenie ocena");

            var expectedDays = _exampleDaysList;

            var actualDays = PdfParser.DayStringsToDayItems(dayStrings);

            // Element-wise, order-sensitive comparison.
            Assert.True(expectedDays.SequenceEqual(actualDays));
        }
        /// <summary>
        /// Produces a filled-in single-page PDF for the given form page and returns its bytes as a stream.
        /// </summary>
        /// <param name="formid">Form identifier; the source PDF is "&lt;formid&gt;.pdf" under <c>pdfroot</c>.</param>
        /// <param name="pageid">1-based page number inside the source PDF.</param>
        /// <returns>
        /// A memory stream with the generated "myform.&lt;formid&gt;.&lt;pageid&gt;.pdf", or <c>null</c> when no
        /// stored form data exists, the page number is out of range, or the extracted page has no fields.
        /// </returns>
        public ActionResult <Stream> GetFile(string formid, int pageid)
        {
            var db      = new DbHelper();
            var jsonObj = db.LoadFormData(formid, pageid);

            // LoadFormData can yield null (GetPage null-checks the same call), so guard before reading Count.
            if (jsonObj == null || jsonObj.Count <= 0)
            {
                return(null);
            }

            var pdffile = Path.Combine(pdfroot, formid + ".pdf");
            var pdfDoc  = new PdfParser(pdffile);
            if (pageid > pdfDoc.GetPageNumber() || pageid < 1)
            {
                return(null);
            }

            var newpdf        = Path.Combine(pdfroot, string.Format("{0}.{1}.pdf", formid, pageid));
            var downpdfname   = string.Format("myform.{0}.{1}.pdf", formid, pageid);
            var localFilePath = Path.Combine(pdfroot, downpdfname);

            if (pdfDoc.PageToNewPdf(pageid, newpdf))
            {
                var pdfDocForm = new PdfParser(newpdf);
                if (pdfDocForm.GetFieldsInfo().Count < 1)
                {
                    return(null);
                }

                // Write the stored values into the extracted page and save it as the download file.
                pdfDocForm.SetFieldsValues(jsonObj, localFilePath);
            }

            // NOTE(review): when PageToNewPdf fails, this still reads whatever file already exists at
            // localFilePath (or throws if none does) — confirm whether that fall-through is intended.
            return(new MemoryStream(System.IO.File.ReadAllBytes(localFilePath)));
        }
Beispiel #15
0
        /// <summary>
        /// A flat word sequence (table header followed by four days) must be regrouped
        /// into one string per day, dropping the table header.
        /// </summary>
        public void WordsToStringDayItemsTest()
        {
            // Space-separated words exactly as they are fed to the parser (note the trailing space).
            const string flatText =
                "Czas od Czas do Liczba godzin Prowadzący Przedmiot Forma zaj. Grupy Sala Forma zaliczenia Uwagi Data Zajęć: 2019-10-04 piątek 17:30 19:00 2h00m dr Name Surname Advanced Math Wyk W/1/Web F Toronto Egzamin 19:15 20:45 2h00m doc. dr John Smiths Modern History of Poland Wyk W/1/Web F Praga Egzamin Data Zajęć: 2019-10-05 sobota 8:00 9:30 2h00m mgr Marry Smiths-Blue Chemistry Cw 12 Blue 2/IEN F Sztokholm Zaliczenie ocena 11:20 14:30 4h00m doc. dr John Black Physics Wyk W/2/W F Toronto Egzamin Data Zajęć: 2019-10-06 niedziela 8:00 9:30 2h00m mgr Jack Green Wzorce projektowe Lab lab/WebN F Montreal Zaliczenie ocena 9:40 11:10 2h00m mgr inż. Thomas Orange Biology and Geography Lab lab15/2/WebN F Los Angeles Zaliczenie ocena 14:40 16:10 2h00m dr George White Geology Lab lab51/22/WebN F San Francisco Zaliczenie ocena 16:20 17:50 2h00m mgr Richard White Basic Mathematics Lab lab22/22/WebN F Chicago Zaliczenie ocena 18:00 19:30 2h00m mgr David Smith Civil Engineering Lab lab3/3/WebN F Nowy Jork Zaliczenie ocena 19:40 21:10 2h00m mgr Jacob Brown Advanced Mathematics Konw konw/2/WebN F Toronto Zaliczenie ocena Data Zajęć: 2019-10-18 piątek 17:30 19:00 2h00m dr Jacob Brown Advanced Physics Wyk W/2/WebN F Toronto Egzamin 19:15 20:45 2h00m dr inż. Thomas Blue Geology Konw konw/2/WebN F Praga Zaliczenie ocena ";

            var expectedDayItems = new List <string>();
            expectedDayItems.Add("Data Zajęć: 2019-10-04 piątek 17:30 19:00 2h00m dr Name Surname Advanced Math Wyk W/1/Web F Toronto Egzamin 19:15 20:45 2h00m doc. dr John Smiths Modern History of Poland Wyk W/1/Web F Praga Egzamin");
            expectedDayItems.Add("Data Zajęć: 2019-10-05 sobota 8:00 9:30 2h00m mgr Marry Smiths-Blue Chemistry Cw 12 Blue 2/IEN F Sztokholm Zaliczenie ocena 11:20 14:30 4h00m doc. dr John Black Physics Wyk W/2/W F Toronto Egzamin");
            expectedDayItems.Add("Data Zajęć: 2019-10-06 niedziela 8:00 9:30 2h00m mgr Jack Green Wzorce projektowe Lab lab/WebN F Montreal Zaliczenie ocena 9:40 11:10 2h00m mgr inż. Thomas Orange Biology and Geography Lab lab15/2/WebN F Los Angeles Zaliczenie ocena 14:40 16:10 2h00m dr George White Geology Lab lab51/22/WebN F San Francisco Zaliczenie ocena 16:20 17:50 2h00m mgr Richard White Basic Mathematics Lab lab22/22/WebN F Chicago Zaliczenie ocena 18:00 19:30 2h00m mgr David Smith Civil Engineering Lab lab3/3/WebN F Nowy Jork Zaliczenie ocena 19:40 21:10 2h00m mgr Jacob Brown Advanced Mathematics Konw konw/2/WebN F Toronto Zaliczenie ocena");
            expectedDayItems.Add("Data Zajęć: 2019-10-18 piątek 17:30 19:00 2h00m dr Jacob Brown Advanced Physics Wyk W/2/WebN F Toronto Egzamin 19:15 20:45 2h00m dr inż. Thomas Blue Geology Konw konw/2/WebN F Praga Zaliczenie ocena");

            var words = flatText.Split(" ");

            var actualDayItems = PdfParser.WordsToStringDayItems(words);

            Assert.AreEqual(expectedDayItems, actualDayItems);
        }
        /// <summary>
        /// Extracts the sales-order PDF attachments from the configured mail folder, parses each one
        /// with the regular expressions stored for the "E2V" customer, and returns one list item per order.
        /// </summary>
        /// <returns>The parsed sales orders, in the order the attachments were returned.</returns>
        /// <exception cref="System.InvalidOperationException">No customer whose name contains "E2V" exists.</exception>
        private async Task <List <SalesOrderListItemViewModel> > GetSalesOrdersAsync()
        {
            // Task.Run understands async lambdas and always uses the thread pool, so the synchronous
            // mail/database work is kept off the caller's thread while the parsing itself is awaited
            // instead of being blocked on with .Result.
            return(await Task.Run(async () =>
            {
                var salesOrderAttachments = OutlookDataProvider.ExtractSalesOrderPdfs(
                    Settings.Default.NewOrderFolderName,
                    Path.GetTempPath());

                string orderExpr, buyerExpr, deliveryExpr, drawingExpr;
                byte[] logoBytes;

                using (var cpe = new CPEUnitOfWork())
                {
                    var customer = cpe.Customers.GetAll().FirstOrDefault(c => c.Name.Contains("E2V"));
                    if (customer == null)
                    {
                        // Fail with a clear message instead of a NullReferenceException on the next line.
                        throw new System.InvalidOperationException("No customer whose name contains \"E2V\" was found.");
                    }

                    orderExpr = customer.OrderNumberRegex;
                    buyerExpr = customer.BuyerRegex;
                    deliveryExpr = customer.DeliveryDateRegex;
                    drawingExpr = customer.DrawingNumberRegex;
                    logoBytes = customer.LogoBLOB;
                }

                var salesOrders = new List <SalesOrderListItemViewModel>();

                foreach (var attachment in salesOrderAttachments)
                {
                    var detail = await PdfParser.ParseSalesOrderAsync(attachment.FileName, attachment.MailId, orderExpr,
                                                                      deliveryExpr, buyerExpr, drawingExpr);

                    salesOrders.Add(new SalesOrderListItemViewModel
                    {
                        Buyer = detail.Buyer,
                        DeliveryDate = detail.DeliveryDate,
                        DrawingNumber = detail.DrawingNumber,
                        OrderNumber = detail.OrderNumber,
                        CompanyLogoBytes = logoBytes,
                        FileName = detail.FileName,
                        MailId = detail.MailId
                    });
                }

                return salesOrders;
            }));
        }
Beispiel #17
0
        /// <summary>
        /// Extracts a single page from a PDF into a new file.
        /// Arguments: input file, output file, page number. The page is written next to the output file
        /// as "&lt;output name&gt; - &lt;page number, 5 digits&gt;.pdf".
        /// </summary>
        /// <param name="args">Command-line arguments: <c>inputFile outputFile pageNumber</c>.</param>
        static void Main(string[] args)
        {
            if (args == null || args.Length < 3)
            {
                // Report the expected usage instead of failing with an IndexOutOfRangeException.
                Console.Error.WriteLine("Usage: <inputFile> <outputFile> <pageNumber>");
                Environment.ExitCode = 1;
                return;
            }

            string inputFile  = args[0];
            string outputFile = args[1];
            int    pageNumber = Int32.Parse(args[2]);

            string target = Path.Combine(Path.GetDirectoryName(outputFile),
                                         Path.GetFileNameWithoutExtension(outputFile) + " - " + pageNumber.ToString("00000") + ".pdf");

            using (var inputStream = new FileStream(inputFile, FileMode.Open, FileAccess.Read, FileShare.ReadWrite))
            {
                var pdfParser = PdfParser.Parse(inputStream);

                using (Stream s = new FileStream(target, FileMode.Create, FileAccess.Write))
                {
                    // One merger per output stream; the second construction on the same stream was redundant.
                    var pdfMerger = new PdfMerger(s);
                    pdfMerger.Add(pdfParser, new int[] { pageNumber });
                    pdfMerger.Finish();
                }
            }
        }
Beispiel #18
0
        /// <summary>
        /// Command-line entry point: encodes or decodes a text against a PDF book,
        /// depending on the mode selected by the arguments, and prints the outcome.
        /// </summary>
        /// <param name="args">Raw command-line arguments, validated by <c>ArgsParser</c>.</param>
        /// <returns><c>0</c> when the arguments fail validation, otherwise <c>1</c>.</returns>
        static int Main(string[] args)
        {
            var cli = new ArgsParser();
            if (!cli.Validate(args))
            {
                return 0;
            }

            var bookReader = new PdfReader(cli.GetBookPath());
            var bookParser = new PdfParser(bookReader);

            // Printed unchanged when neither mode applies.
            var message = "No action has been executed";

            if (cli.GetMode() == ArgsParser.Mode.Encode)
            {
                try
                {
                    var encoder = new Encoder(bookParser);
                    message = $"Encoded text: {encoder.Encode(cli.GetText())}";
                }
                catch (EncodeException encodeError)
                {
                    // Encoding-specific failures carry their own message.
                    message = encodeError.Message;
                }
                catch (Exception)
                {
                    message = "Couldn't encode this string. ";
                }
            }

            if (cli.GetMode() == ArgsParser.Mode.Decode)
            {
                try
                {
                    var decoder = new Decoder(bookParser);
                    message = $"Decoded text: {decoder.Decode(cli.GetText())}";
                }
                catch (Exception)
                {
                    message = "Couldn't decode this string.";
                }
            }

            Console.WriteLine(message);

            return 1;
        }
Beispiel #19
0
        /// <summary>
        /// Extracts the text of the uploaded PDF and parses the daily statistics out of it.
        /// </summary>
        /// <param name="request">Command carrying the uploaded file.</param>
        /// <param name="cancellationToken">Not used by this handler.</param>
        /// <returns>
        /// A successful result with the parsed statistics, or a failure result containing
        /// the file name and the exception message when anything goes wrong.
        /// </returns>
        public async Task <Result <DailyPdfStats> > Handle(ParsePdfCommand request, CancellationToken cancellationToken)
        {
            // The handler does no asynchronous work itself; this await only satisfies the async signature.
            await Task.FromResult(true);

            try
            {
                var result = new DailyPdfStats
                {
                    FileName = request.File.FileName
                };

                // Concatenate the extracted text of every page.
                var buffer = new StringBuilder();
                using (var stream = request.File.OpenReadStream())
                {
                    var doc = new PdfDocument(stream);
                    foreach (PdfPageBase page in doc.Pages)
                    {
                        buffer.Append(page.ExtractText());
                    }
                }

                var pdfContents = buffer.ToString();
                result.NumberInfected = PdfParser.ParseNumberInfected(pdfContents);
                result.NumberDeceased = PdfParser.ParseNumberDeceased(pdfContents);
                result.NumberCured    = PdfParser.ParseNumberCured(pdfContents);
                result.AverageAge     = PdfParser.ParseAverageAge(pdfContents);

                result.DistributionByAge = PdfParser.ParseDistributionByAge(pdfContents);
                var parsedOn = PdfParser.TryParsePublishedDate(request.File.FileName);
                result.ParsedOn       = new DateTimeOffset(parsedOn).ToUnixTimeSeconds();
                // Machine-readable date: format with the invariant culture so the server's
                // culture/calendar cannot change the digits that are produced.
                result.ParsedOnString = parsedOn.ToString("yyyy-MM-dd", System.Globalization.CultureInfo.InvariantCulture);

                return(Result.Ok(result));
            }
            catch (Exception e)
            {
                // Null-safe access: a missing request/file must not make the error path itself throw.
                return(Result.Failure <DailyPdfStats>($"{request?.File?.FileName} -- {e.Message}"));
            }
        }
        /// <summary>
        /// Builds an output PDF by appending the input PDF to it a given number of times.
        /// </summary>
        /// <param name="args">Command-line arguments: <c>inputFile outputFile iterations</c>.</param>
        static void Main(string[] args)
        {
            var sourcePath = args[0];
            var targetPath = args[1];
            var copies     = Int32.Parse(args[2]);

            using (var target = new FileStream(targetPath, FileMode.Create, FileAccess.Write))
            {
                var merger = new PdfMerger(target);

                // The input file is reopened and reparsed for every copy that is appended.
                for (var pass = 0; pass < copies; pass++)
                {
                    using (var source = new FileStream(sourcePath, FileMode.Open, FileAccess.Read, FileShare.ReadWrite))
                    {
                        merger.Add(PdfParser.Parse(source), null);
                    }
                }

                merger.Finish();
            }
        }
Beispiel #21
0
        /// <summary>
        /// Finds the financial statement of a foundation for a given year, downloads it
        /// (unless already cached locally) and extracts the balance sheet total from the PDF.
        /// </summary>
        /// <param name="foundation">Foundation name passed to the crawler.</param>
        /// <param name="shortName">Short name used for the local file name "&lt;shortName&gt;_&lt;year&gt;.pdf".</param>
        /// <param name="year">Reporting year.</param>
        /// <param name="host">Optional host passed to the crawler.</param>
        /// <returns>
        /// A result with <c>Success = true</c>, the statement URL and the balance sheet total;
        /// or <c>Success = false</c> when no statement was found or the PDF could not be parsed.
        /// </returns>
        private static async Task <FoundationResult> ProcessFoundationAsync(string foundation, string shortName, int year, string host = null)
        {
            // Path.Combine inserts exactly one platform-appropriate separator between the parts,
            // whether or not BaseDirectory already ends with one.
            var downloadUri = Path.Combine(AppDomain.CurrentDomain.BaseDirectory, "Data", $"{shortName}_{year}.pdf");

            var result = Crawler.FindFinancialStatement(foundation, year, host).ToArray();

            // Prefer a link that mentions "rechnung"; otherwise fall back to the first hit.
            var jahresRechnungUrl = result.FirstOrDefault(r => r.AbsoluteUri.Contains("rechnung")) ?? result.FirstOrDefault();

            if (jahresRechnungUrl == null)
            {
                return(new FoundationResult {
                    Success = false
                });
            }

            if (!File.Exists(downloadUri)) // Don't download the file if we already have it
            {
                await Downloader.DownloadAsync(jahresRechnungUrl.AbsoluteUri, downloadUri);
            }

            try
            {
                var balanceSheetTotal = PdfParser.FindTotalActiva(downloadUri, year).GetValueOrDefault();

                return(new FoundationResult
                {
                    FinancialStatementUrl = jahresRechnungUrl,
                    BalanceSheetTotal = balanceSheetTotal,
                    Success = true
                });
            }
            catch (Exception ex)
            {
                // Parsing problems are reported on the console and turned into an unsuccessful result.
                Console.WriteLine("ProcessFoundation: {0}", ex);
                return(new FoundationResult {
                    Success = false
                });
            }
        }
Beispiel #22
0
        /// <summary>
        /// Builds 3-grams for every PDF in the "Pdf" directory and checks the
        /// dimensions of the resulting TF-IDF matrix.
        /// </summary>
        public void CreateTfIdf()
        {
            var tfIdf = new DocumentTermFrequency();

            var nGram     = new Ngram();
            var pdfParser = new PdfParser();

            // File name (without extension) -> n-grams of the file's text.
            var reports = new Dictionary <string, List <string> >();

            foreach (var file in Directory.EnumerateFiles("Pdf", "*.pdf"))
            {
                var fileName = Path.GetFileNameWithoutExtension(file);

                var contents = pdfParser.GetText(file);

                reports[fileName] = nGram.Create(contents, 3);
            }

            var result = tfIdf.Create(reports);

            // Expected value goes first so a failure message reports expected/actual the right way round.
            Assert.AreEqual(2, result.GetLength(0));
            Assert.AreEqual(7, result.GetLength(1));
        }
        /// <summary>
        /// Returns the field information for one page of a PDF form.
        /// </summary>
        /// <param name="formid">Form identifier; the source PDF is "&lt;formid&gt;.pdf" under <c>pdfroot</c>.</param>
        /// <param name="pageid">1-based page number inside the source PDF.</param>
        /// <param name="forcegenerate">When <c>true</c>, skips the database lookup and always regenerates from the PDF.</param>
        /// <returns>
        /// Stored form data or form config when available (and not forced); otherwise the fields read from
        /// the extracted page, which are also saved to the database. <c>null</c> when the page number is
        /// out of range or the page could not be extracted.
        /// </returns>
        public List <Dictionary <string, object> > GetPage(string formid, int pageid, bool forcegenerate = false)
        {
            if (!forcegenerate)
            {
                // Stored data wins over stored config; either one short-circuits the PDF work.
                var store  = new DbHelper();
                var stored = store.LoadFormData(formid, pageid) ?? store.LoadFormConfig(formid, pageid);
                if (stored != null)
                {
                    return stored;
                }
            }

            var sourcePdf = new PdfParser(Path.Combine(pdfroot, formid + ".pdf"));
            if (pageid > sourcePdf.GetPageNumber() || pageid < 1)
            {
                return null;
            }

            var pagePdfPath = Path.Combine(pdfroot, string.Format("{0}.{1}.pdf", formid, pageid));
            if (!sourcePdf.PageToNewPdf(pageid, pagePdfPath))
            {
                return null;
            }

            // Read the fields from the single-page PDF and persist them for later calls.
            var fields = new PdfParser(pagePdfPath).GetFieldsInfo();
            new DbHelper().SavePdfFormData(formid, pageid, fields);
            return fields;
        }
Beispiel #24
0
        /// <summary>
        /// A day item holding six lessons must be split into six lesson strings,
        /// each keeping its trailing space except the last one.
        /// </summary>
        public void ExtractLessonStringsFromDayStringItemsTest()
        {
            var expectedLessons = new List <string>
            {
                "8:00 9:30 2h00m mgr Jack Green Wzorce projektowe Lab lab/WebN F Montreal Zaliczenie ocena ",
                "9:40 11:10 2h00m mgr inż. Thomas Orange Biology and Geography Lab lab15/2/WebN F Los Angeles Zaliczenie ocena ",
                "14:40 16:10 2h00m dr George White Geology Lab lab51/22/WebN F San Francisco Zaliczenie ocena ",
                "16:20 17:50 2h00m mgr Richard White Basic Mathematics Lab lab22/22/WebN F Chicago Zaliczenie ocena ",
                "18:00 19:30 2h00m mgr David Smith Civil Engineering Lab lab3/3/WebN F Nowy Jork Zaliczenie ocena ",
                "19:40 21:10 2h00m mgr Jacob Brown Advanced Mathematics Konw konw/2/WebN F Toronto Zaliczenie ocena"
            };

            // The day item is the date header followed by the lesson strings back to back.
            var dayItem = "Data Zajęć: 2019-10-06 niedziela " + string.Concat(expectedLessons);

            var actualLessons = PdfParser.ExtractLessonStringsFromDayStringItem(dayItem);

            Assert.AreEqual(expectedLessons, actualLessons);
        }
        /// <summary>
        /// Creates the parser that handles the given file type.
        /// </summary>
        /// <param name="fileType">File type to parse.</param>
        /// <returns>A new parser instance for <paramref name="fileType"/>.</returns>
        /// <exception cref="NotSupportedException">No parser exists for <paramref name="fileType"/>.</exception>
        private IParser GetParser(FileExtension fileType)
        {
            switch (fileType)
            {
            case FileExtension.Docx:
                return(new DocxParser());

            case FileExtension.Odt:
                return(new OdtParser());

            case FileExtension.Pdf:
                return(new PdfParser());

            default:
                // A specific exception type instead of the bare System.Exception; it still derives
                // from Exception and keeps the same message, so existing handlers are unaffected.
                throw new NotSupportedException("Unknown file type");
            }
        }
Beispiel #26
0
        /// <summary>
        /// Assembles a minimal one-page PDF 1.4 document in memory (header, objects, xref table, trailer),
        /// feeds its bytes to <c>PdfParser</c> and walks Root -> Pages -> Kids -> Page, checking each type name.
        /// </summary>
        public void TestPdfParser()
        {
            var pdf     = new StringBuilder();
            var offsets = new List <int> { 0 };

            // File header plus the customary comment line of non-ASCII characters.
            pdf.AppendLine("%PDF-1.4");
            pdf.AppendLine("%ту╧╙");

            // Body objects; each helper call returns the id used to reference the object further down.
            var contentId = addStream(pdf, offsets, $"Hello World");
            var pageId    = addObject(pdf, offsets, $"<</Type/Page /Parent ~1~ 0 R/Contents {contentId} 0 R>>");
            var pagesId   = addObject(pdf, offsets, $"<</Type/Pages /Kids[{pageId} 0 R] /Count 1>>");
            var catalogId = addObject(pdf, offsets, $"<</Type/Catalog /Pages {pagesId} 0 R>>");

            // The cross-reference table starts at the current end of the text.
            var xrefOffset = pdf.Length;

            pdf.AppendLine("xref");
            pdf.AppendLine($"0 {offsets.Count}");
            pdf.AppendLine("0000000000 65535 f");
            foreach (var offset in offsets)
            {
                // Entries equal to 0 are skipped; the free entry was already written above.
                if (offset != 0)
                {
                    pdf.AppendLine($"{offset:0000000000} 00000 n");
                }
            }
            pdf.AppendLine($"trailer<</Size {offsets.Count}/Root {catalogId} 0 R>>");
            pdf.AppendLine("startxref");
            pdf.AppendLine(xrefOffset.ToString());
            pdf.AppendLine("%%EOF");

            // One byte per character: each char is narrowed to its low byte.
            var pdfText  = pdf.ToString();
            var pdfBytes = new byte[pdfText.Length];
            for (var i = 0; i < pdfText.Length; i++)
            {
                pdfBytes[i] = (byte)pdfText[i];
            }

            var parsed = new PdfParser(pdfBytes);

            Assert.AreEqual("1.4", parsed.PdfVersion);

            var trailerEntries = parsed.Tokeniser.TrailerEntries;
            var catalog        = (DictionaryToken)trailerEntries["Root"];
            Assert.AreEqual("Catalog", ((NameToken)catalog["Type"]).Value);

            var pageTree = (DictionaryToken)catalog["Pages"];
            Assert.AreEqual("Pages", ((NameToken)pageTree["Type"]).Value);

            var kidTokens = (ArrayToken)pageTree["Kids"];
            foreach (var kidToken in kidTokens)
            {
                var pageDictionary = (DictionaryToken)kidToken;
                Assert.AreEqual("Page", ((NameToken)pageDictionary["Type"]).Value);

                // The cast itself is the check: it throws unless Contents is a dictionary token.
                _ = (DictionaryToken)pageDictionary["Contents"];
            }
        }
Beispiel #27
0
        /// <summary>
        /// The average age parsed from the given resource must equal the expected value.
        /// </summary>
        /// <param name="file">Resource name of the PDF text to load.</param>
        /// <param name="expectedMap">Expected parse result.</param>
        public void Parse_average_age_correctly(string file, string expectedMap)
        {
            var averageAge = PdfParser.ParseAverageAge(LoadResource(file));

            averageAge.ShouldBe(expectedMap);
        }
Beispiel #28
0
 /// <summary>
 /// Stores the collaborators this engine works with.
 /// </summary>
 /// <param name="parser">Parser kept in <c>_parser</c>.</param>
 /// <param name="extractPdf">Extractor kept in <c>_extractPdf</c>.</param>
 private PdfOcrEngine(PdfParser parser, ExtractPdf extractPdf)
 {
     _extractPdf = extractPdf;
     _parser     = parser;
 }
Beispiel #29
0
 /// <summary>
 /// Creates a decoder that works on the given PDF parser.
 /// </summary>
 /// <param name="PdfParser">Parser stored in the member of the same name.</param>
 public Decoder(PdfParser PdfParser) => this.PdfParser = PdfParser;
Beispiel #30
0
        private void navigate(bool isNext)
        {
            pdfRefRunTrace.Clear();
            BackStatusBarItem.Visibility = Visibility.Collapsed;

            //check if user has changed file or directory
            if (FileTextBox.Text != "" && fileString != FileTextBox.Text)
            {
                var fileInfo = new FileInfo(FileTextBox.Text);
                if (!fileInfo.Exists)
                {
                    MessageBox.Show($"Could not find file '{FileTextBox.Text}'.", "Pdf file not found");
                    return;
                }
                fileString            = FileTextBox.Text;
                directoryString       = fileInfo.DirectoryName;
                DirectoryTextBox.Text = directoryString;
                files.Clear();
                dirs.Clear();
                allFiles.Clear();
                currentFileIndex = 0;
                dirs.Push(new DirectoryInfo(directoryString));
                isShowStartFile = true;
            }
            else if (DirectoryTextBox.Text != "" && directoryString != DirectoryTextBox.Text)
            {
                var directoryInfo = new DirectoryInfo(DirectoryTextBox.Text);
                if (!directoryInfo.Exists)
                {
                    MessageBox.Show($"Could not find directory '{DirectoryTextBox.Text}'.", "Directory not found");
                    return;
                }
                directoryString = DirectoryTextBox.Text;
                files.Clear();
                dirs.Clear();
                allFiles.Clear();
                currentFileIndex = 0;
                dirs.Push(directoryInfo);
            }

            var haveAllFilesBeenFound = false;

            if (currentFileIndex < allFiles.Count - 1 || (isNext == false) || (files.Count == 0 && dirs.Count == 0))
            {
                //show already read files
                if (isNext)
                {
                    currentFileIndex++;
                    if (currentFileIndex >= allFiles.Count)
                    {
                        currentFileIndex = 0;
                    }
                }
                else
                {
                    currentFileIndex--;
                    if (currentFileIndex < 0)
                    {
                        currentFileIndex = allFiles.Count - 1;
                    }
                }
            }
            else
            {
                while (files.Count == 0 && !haveAllFilesBeenFound)
                {
                    if (dirs.Count == 0)
                    {
                        if (allFiles.Count == 0)
                        {
                            MessageBox.Show($"There are no pdf files in '{directoryString}' and its subdirectories.", "No pdf file found");
                            return;
                        }
                        else
                        {
                            currentFileIndex      = 0;
                            haveAllFilesBeenFound = true;
                        }
                    }
                    else
                    {
                        //read next directory
                        var dir = dirs.Pop();
                        foreach (var subDir in dir.GetDirectories().OrderByDescending(d => d.Name))
                        {
                            dirs.Push(subDir);
                        }

                        foreach (var subfile in dir.GetFiles("*.pdf"))
                        {
                            if (isShowStartFile)
                            {
                                if (subfile.FullName == fileString)
                                {
                                    isShowStartFile = false;
                                    files.Enqueue(subfile);
                                }
                                else
                                {
                                    allFiles.Add(subfile);
                                }
                            }
                            else
                            {
                                files.Enqueue(subfile);
                            }
                        }
                    }
                }

                if (!haveAllFilesBeenFound)
                {
                    currentFileIndex = allFiles.Count;
                    allFiles.Add(files.Dequeue());
                }
            }

            var file = allFiles[currentFileIndex];

            FileTextBox.Text = file.FullName;
            fileString       = FileTextBox.Text;
            PagesTabControl.Items.Clear();
            PdfParser pdfParser;

            try {
                pdfParser = new PdfParser(file.FullName, "|", streamBuffer, stringBuilder);
            } catch (Exception ex) {
                var pageTabItem = new TabItem {
                    Header = "E_xception"
                };
                var bytes = "";
                if (ex is PdfException pdfException)
                {
                    bytes = Environment.NewLine + Environment.NewLine + pdfException.Tokeniser.ShowBufferContent();
                }
                var textBox = new TextBox {
                    Text = ex.ToDetailString() + bytes,
                    VerticalScrollBarVisibility = ScrollBarVisibility.Auto,
                    IsReadOnly = true
                };
                pageTabItem.Content = textBox;
                PagesTabControl.Items.Add(pageTabItem);
                PagesTabControl.SelectedIndex = 0;
                return;
            }

            try {
                PdfWebBrowser.Visibility = Visibility.Visible;
                PdfTextBox.Visibility    = Visibility.Collapsed;
                var fileUri = new Uri(new Uri("file://"), file.FullName);
                PdfWebBrowser.Navigate(fileUri);
                //if (PdfWebBrowser.Source?.AbsolutePath!=file.FullName) {
                //  //couldn't find file
                //  PdfWebBrowser.Navigate(new Uri("about:blank"));
                //}
            } catch (Exception ex) {
                PdfWebBrowser.Visibility = Visibility.Collapsed;
                PdfTextBox.Visibility    = Visibility.Visible;
                PdfTextBox.Text          = ex.ToDetailString();
            }

            var pageIndex = 0;

            foreach (var page in pdfParser.Pages)
            {
                //todo: How to deal with pdf documents having more than 21 pages? Only page indexes 0..20 get a tab.
                if (pageIndex > 20)
                {
                    break;
                }

                var hasException = false;
                var underline    = "";
                if (pageIndex < 10)
                {
                    underline = "_";
                }
                var pageTabItem = new TabItem {
                    Header = underline + pageIndex++
                };
                stringBuilder.Clear();
                var isFirstContent = true;
                foreach (var content in page.Contents)
                {
                    if (isFirstContent)
                    {
                        isFirstContent = false;
                    }
                    else
                    {
                        stringBuilder.AppendLine(new string('-', 80));
                    }
                    stringBuilder.AppendLine(content.Text);
                    if (content.Exception != null)
                    {
                        hasException = true;
                        stringBuilder.AppendLine(new string('+', 80));
                        stringBuilder.AppendLine(content.Exception);
                        stringBuilder.AppendLine(new string('+', 80));
                    }
                    if (content.Error != null)
                    {
                        hasException = true;
                        stringBuilder.AppendLine(new string('+', 80));
                        stringBuilder.AppendLine(content.Error);
                        stringBuilder.AppendLine(new string('+', 80));
                    }
                }

                if (page.Exception != null)
                {
                    hasException = true;
                    stringBuilder.AppendLine(new string('+', 80));
                    stringBuilder.AppendLine(page.Exception);
                    stringBuilder.AppendLine(new string('+', 80));
                }
                var textBox = new TextBox {
                    Text = stringBuilder.ToString(),
                    VerticalScrollBarVisibility   = ScrollBarVisibility.Auto,
                    HorizontalScrollBarVisibility = ScrollBarVisibility.Auto,
                    IsReadOnly = true
                };

                if (hasException)
                {
                    pageTabItem.Background = Brushes.Khaki;
                }
                pageTabItem.Content = textBox;
                PagesTabControl.Items.Add(pageTabItem);
            }

            var infoTabItem = new TabItem {
                Header = "_Info"
            };
            var tokeniser = pdfParser.Tokeniser;
            var infotext  = "PDF Version: " + tokeniser.PdfVersion;

            if (tokeniser.DocumentInfo != null)
            {
                infotext += Environment.NewLine + Environment.NewLine + "Document Info: " + tokeniser.DocumentInfo;
            }
            if (tokeniser.DocumentID != null)
            {
                infotext += Environment.NewLine + Environment.NewLine + "Document ID: " + tokeniser.DocumentID;
            }
            infotext += Environment.NewLine + Environment.NewLine + "Pages: " + tokeniser.Pages.Count;
            infotext += Environment.NewLine + Environment.NewLine + "Fonts: ";
            foreach (var objectId_Token in tokeniser.Tokens)
            {
                if (objectId_Token.Value is DictionaryToken objectDictionaryToken)
                {
                    if (objectDictionaryToken.Type == "Font")
                    {
                        var pdfFont = (PdfFont)objectDictionaryToken.PdfObject !;
                        infotext += Environment.NewLine + Environment.NewLine + pdfFont.Name + objectDictionaryToken.ToString();
                        if (pdfFont.ToUnicodeHeader != null)
                        {
                            infotext += Environment.NewLine + "ToUnicodeHeader: " + pdfFont.ToUnicodeHeader;
                        }
                        if (pdfFont.CMap != null)
                        {
                            foreach (var code_char in pdfFont.CMap)
                            {
                                infotext += Environment.NewLine + $"{code_char.Key}: '{code_char.Value}'";
                            }
                        }
                        if (pdfFont.Exception != null)
                        {
                            infotext += Environment.NewLine + new string('+', 80);
                            infotext += Environment.NewLine + pdfFont.Exception;
                            infotext += Environment.NewLine + new string('+', 80);
                            infoTabItem.Background = Brushes.Khaki;
                        }

                        infotext += Environment.NewLine;
                    }
                }
            }
            if (tokeniser.Metadata != null)
            {
                infotext += Environment.NewLine + Environment.NewLine + "Meta data: " + tokeniser.Metadata;
            }
            var textBoxInfo = new TextBox {
                Text = infotext,
                VerticalScrollBarVisibility   = ScrollBarVisibility.Auto,
                HorizontalScrollBarVisibility = ScrollBarVisibility.Auto,
                IsReadOnly = true
            };

            infoTabItem.Content = textBoxInfo;
            PagesTabControl.Items.Add(infoTabItem);

            var bytesTabItem = new TabItem {
                Header = "_Bytes"
            };
            //var bytesContextMenu = new ContextMenu();
            //bytesContextMenu.Items.Add(new MenuItem { Command = System.Windows.Input.ApplicationCommands.SelectAll});
            //bytesContextMenu.Items.Add(new MenuItem { Command = System.Windows.Input.ApplicationCommands.Copy});
            //bytesContextMenu.Items.Add(new MenuItem { Command = System.Windows.Input.ApplicationCommands.Cut });
            //bytesContextMenu.Items.Add(new MenuItem { Command = System.Windows.Input.ApplicationCommands.Paste });
            //bytesContextMenu.Items.Add(new MenuItem { Command = System.Windows.Input.ApplicationCommands.Undo });
            //bytesContextMenu.Items.Add(new MenuItem { Command = System.Windows.Input.ApplicationCommands.Redo });
            ///*
            //(int)Shortcut.CtrlS, Show Stream
            //(int)Shortcut.CtrlA, Select all
            //(int)Shortcut.CtrlC, Copy
            //(int)Shortcut.CtrlX, Cut
            //(int)Shortcut.CtrlV, Paste
            //Shortcut.CtrlZ, Undo
            //(int)Shortcut.CtrlY, Redo
            //*/

            //bytesTextBox = new TextBox {
            //  Text = pdfParser.Tokeniser.ShowBufferContent(),
            //  VerticalScrollBarVisibility = ScrollBarVisibility.Auto,
            //  HorizontalScrollBarVisibility = ScrollBarVisibility.Auto,
            //  ContextMenu = bytesContextMenu,
            //  IsReadOnly = true
            //  };
            var pdfSourceRichTextBox = new PdfSourceRichTextBox(pdfParser.Tokeniser, stringBuilder, this);

            bytesTabItem.Content = pdfSourceRichTextBox;
            PagesTabControl.Items.Add(bytesTabItem);

            PagesTabControl.SelectedIndex = 0;
        }