示例#1
0
        /// <summary>
        /// Scans the string tokens of every page of <c>pdfReader</c> and reports whether
        /// one of them is exactly equal to <paramref name="text"/>.
        /// </summary>
        /// <param name="text">Text to look for; matched against whole string tokens, not substrings.</param>
        /// <returns>
        /// <c>true</c> when at least one string token equals <paramref name="text"/>;
        /// otherwise <c>false</c>.
        /// </returns>
        /// <remarks>
        /// Each string token is echoed to <c>_output</c> (raw, then after a UTF-8 round trip).
        /// <c>pdfReader</c> is closed before the method returns, even when tokenizing throws.
        /// </remarks>
        public bool PdfContains(string text)
        {
            // Collected across ALL pages so the final membership test sees the whole document.
            var stringsList = new List<string>();

            try
            {
                for (var page = 1; page <= pdfReader.NumberOfPages; page++)
                {
                    // Read the page being iterated; the page index must follow the loop variable.
                    var streamBytes = pdfReader.GetPageContent(page);
                    var tokenizer   = new PrTokeniser(new RandomAccessFileOrArray(streamBytes));

                    try
                    {
                        while (tokenizer.NextToken())
                        {
                            if (tokenizer.TokenType != PrTokeniser.TK_STRING)
                            {
                                continue;
                            }

                            var token = tokenizer.StringValue;
                            stringsList.Add(token);
                            _output.WriteLine(token);

                            // Diagnostic echo of the token after a UTF-8 encode/decode round trip.
                            var currentText = Encoding.UTF8.GetString(Encoding.UTF8.GetBytes(token));
                            _output.WriteLine(currentText);
                        }
                    }
                    finally
                    {
                        tokenizer.Close();
                    }
                }
            }
            finally
            {
                pdfReader.Close();
            }

            return stringsList.Contains(text);
        }
示例#2
0
        /// <summary>
        /// Extracts text from an uploaded document and returns it as JSON.
        /// </summary>
        /// <param name="file">
        /// Uploaded file. When its content type is <c>application/pdf</c> the string tokens of
        /// page 1 are concatenated; any other content type is opened as an
        /// <see cref="XWPFDocument"/> and read through <see cref="XWPFWordExtractor"/>.
        /// </param>
        /// <returns>A JSON result carrying a single <c>texto</c> property with the extracted text.</returns>
        public IActionResult PostearDoc(IFormFile file)
        {
            string texto;

            if (file.ContentType == "application/pdf")
            {
                using (MemoryStream ms = new MemoryStream())
                {
                    file.CopyTo(ms);
                    byte[]    que       = ms.ToArray();
                    PdfReader pdfReader = new PdfReader(que);

                    try
                    {
                        // Only the first page is inspected.
                        byte[] contenidoPageUno = pdfReader.GetPageContent(1);

                        PrTokeniser tokenizer = new PrTokeniser(new RandomAccessFileOrArray(contenidoPageUno));

                        try
                        {
                            List <string> strList = new List <string>();

                            while (tokenizer.NextToken())
                            {
                                if (tokenizer.TokenType == PrTokeniser.TK_STRING)
                                {
                                    strList.Add(tokenizer.StringValue);
                                }
                            }

                            // Join once at the end instead of re-allocating the string per token.
                            texto = string.Concat(strList);
                        }
                        finally
                        {
                            tokenizer.Close();
                        }
                    }
                    finally
                    {
                        // Release the reader even when tokenizing fails.
                        pdfReader.Close();
                    }
                }
            }
            else
            {
                // Dispose the upload stream once the text has been extracted.
                using (Stream entrada = file.OpenReadStream())
                {
                    XWPFDocument      doc       = new XWPFDocument(entrada);
                    XWPFWordExtractor extractor = new XWPFWordExtractor(doc);
                    texto = extractor.Text;
                }
            }

            return(Json(new { texto }));
        }
        /// <summary>
        /// Tokenizes page 1 of <c>issue42.pdf</c> and checks that the string token
        /// "demonstration" is among the extracted strings.
        /// </summary>
        public void Verify_Issue42_CanBe_Processed()
        {
            var pdfReader  = new PdfReader(TestUtils.GetPdfsPath("issue42.pdf"));
            var pageTokens = new PrTokeniser(new RandomAccessFileOrArray(pdfReader.GetPageContent(1)));
            var extracted  = new List <string>();

            // Keep only string tokens; everything else is skipped.
            while (pageTokens.NextToken())
            {
                if (pageTokens.TokenType != PrTokeniser.TK_STRING)
                {
                    continue;
                }

                extracted.Add(pageTokens.StringValue);
            }

            pdfReader.Close();

            var containsExpected = extracted.Contains("demonstration");
            Assert.IsTrue(containsExpected);
        }
        /// <summary>
        /// Creates a sample PDF, tokenizes its first page and checks that the string
        /// token "Hello DNT!" was extracted.
        /// </summary>
        public void Test_Extract_Text()
        {
            var samplePath = createSamplePdfFile();
            var pdf        = new PdfReader(samplePath);
            var firstPage  = pdf.GetPageContent(1);
            var lexer      = new PrTokeniser(new RandomAccessFileOrArray(firstPage));
            var found      = new List <string>();

            // Advance token by token, collecting the string tokens only.
            for (var hasToken = lexer.NextToken(); hasToken; hasToken = lexer.NextToken())
            {
                if (lexer.TokenType == PrTokeniser.TK_STRING)
                {
                    found.Add(lexer.StringValue);
                }
            }

            pdf.Close();

            Assert.IsTrue(found.Contains("Hello DNT!"));
        }
示例#5
0
        /// <summary>
        /// Concatenates the string tokens found in the content stream of every page of a PDF.
        /// </summary>
        /// <param name="path">Path passed to the <see cref="PdfReader"/> constructor.</param>
        /// <returns>All string tokens in page order, each followed by a single space.</returns>
        public static string Extract(string path)
        {
            var reader = new PdfReader(path);
            var sb     = new StringBuilder();

            try
            {
                for (int i = 1; i <= reader.NumberOfPages; i++)
                {
                    var streamBytes = reader.GetPageContent(i);
                    var tokenizer   = new PrTokeniser(new RandomAccessFileOrArray(streamBytes));

                    try
                    {
                        while (tokenizer.NextToken())
                        {
                            if (tokenizer.TokenType == PrTokeniser.TK_STRING)
                            {
                                // The raw token value is appended as-is; no re-encoding is applied.
                                sb.Append(tokenizer.StringValue).Append(' ');
                            }
                        }
                    }
                    finally
                    {
                        tokenizer.Close();
                    }
                }
            }
            finally
            {
                // Release the reader even when a page fails to tokenize.
                reader.Close();
            }

            return(sb.ToString());
        }
示例#6
0
        /// <summary>
        /// Walks the string tokens of page 1 of <c>_pdfReceipt</c> and fills
        /// <c>_location</c>, <c>_date</c> and <c>_pdfReceiptItems</c>.
        /// </summary>
        /// <remarks>
        /// The first non-blank token becomes the location. A token containing "Dato:" is
        /// parsed as the date. After a token equal to "Pris" is seen, the following tokens are
        /// consumed in a name / count / price cycle until "I alt inkl. moms" ends the scan.
        /// The reader and tokenizer are closed on every exit path.
        /// </remarks>
        private void ExtractData()
        {
            var state  = State.Stop;
            var reader = new PdfReader(_pdfReceipt.OpenReadStream());

            try
            {
                var streamBytes = reader.GetPageContent(1);
                var tokenizer   = new PrTokeniser(new RandomAccessFileOrArray(streamBytes));

                try
                {
                    // Position inside the name(1) -> count(2) -> price(3) cycle.
                    int num            = 1;
                    var pdfReceiptItem = new PdfReceiptItemDto();

                    bool isFirstWordTaken = false;

                    while (tokenizer.NextToken())
                    {
                        if (tokenizer.TokenType == PrTokeniser.TK_STRING)
                        {
                            var currentText = tokenizer.StringValue;
                            currentText = Encoding.UTF8.GetString(ASCIIEncoding.Convert(Encoding.Default,
                                                                                        Encoding.UTF8, Encoding.Default.GetBytes(currentText)));
                            Console.WriteLine(currentText);

                            // Location: the first non-blank string token.
                            if (!isFirstWordTaken && !string.IsNullOrWhiteSpace(currentText))
                            {
                                _location        = currentText;
                                isFirstWordTaken = true;
                            }

                            // Date: drop the first 6 characters, then format and parse the rest.
                            // NOTE(review): assumes the token starts with "Dato: " — confirm.
                            if (currentText.Contains("Dato:"))
                            {
                                var formatted = GetFormattedDateString(currentText.Remove(0, 6));
                                _date = DateTime.Parse(formatted);
                            }

                            // Receipt items: only collected once the "Pris" header has been seen.
                            if (state == State.Start)
                            {
                                switch (num)
                                {
                                    case 1:
                                        // Item name; "Rabat" marks the item as a discount.
                                        pdfReceiptItem.Name = currentText;
                                        if (currentText == "Rabat")
                                        {
                                            pdfReceiptItem.IsDiscount = true;
                                        }
                                        num++;
                                        break;

                                    case 2:
                                        // Count; stay on this step until a numeric token arrives.
                                        var countParsed = double.TryParse(currentText, out double count);

                                        if (countParsed)
                                        {
                                            pdfReceiptItem.Count = (int)count;
                                            num++;
                                        }

                                        break;

                                    case 3:
                                        // Price; the item is stored only when the token is numeric.
                                        var priceParsed = double.TryParse(currentText, out double price);

                                        if (priceParsed)
                                        {
                                            pdfReceiptItem.Price = price;
                                            _pdfReceiptItems.Add(pdfReceiptItem);
                                            pdfReceiptItem = new PdfReceiptItemDto();
                                        }
                                        // Restart the cycle whether or not the price parsed.
                                        num = 1;
                                        break;
                                }
                            }

                            // "Pris" starts item collection from the next token on.
                            if (currentText == "Pris")
                            {
                                state = State.Start;
                            }

                            // The total line ends the scan.
                            else if (currentText == "I alt inkl. moms")
                            {
                                break;
                            }
                        }
                    }
                }
                finally
                {
                    tokenizer.Close();
                }
            }
            finally
            {
                // Release the reader on normal completion, early break and exceptions alike.
                reader.Close();
            }
        }
示例#7
0
        /// <summary>
        /// Uses <see cref="iTextSharp"/> library to extract plaintext
        /// from pdf file.
        /// </summary>
        /// <param name="pathToPdf">Path to PDF file</param>
        /// <returns>
        /// Plaintext string: the string tokens of each page, with a line break appended after
        /// every page. Extraction stops after the first page at which the accumulated text
        /// contains "ANNEX II".
        /// </returns>
        public static string GetPlainText(string pathToPdf)
        {
            var pdf = new PdfReader(pathToPdf);
            //string builder for output
            var sb = new StringBuilder();

            try
            {
                //go page-by-page; page numbers are 1-based, so the last page is NumberOfPages itself
                for (var i = 1; i <= pdf.NumberOfPages; i++)
                {
                    var streamBytes = pdf.GetPageContent(i);
                    var tokeniser   = new PrTokeniser(new RandomAccessFileOrArray(streamBytes));

                    try
                    {
                        while (tokeniser.NextToken())
                        {
                            switch (tokeniser.TokenType)
                            {
                                //string tokens seem to encompass everything we're interested in
                                case PrTokeniser.TK_STRING:
                                    sb.Append(tokeniser.StringValue);
                                    break;

                                //todo:find consistent way of parsing newlines
                                //newline tokens. Seem to be inconsistent, hacky to
                                //add new cases as seen but works for now.
                                case PrTokeniser.TK_NUMBER:
                                    if (tokeniser.StringValue.Equals("-1.159"))
                                    {
                                        sb.Append(Environment.NewLine);
                                    }

                                    break;

                                //the text operators "TD"/"Td" (and "ET", "Tm", "T*") were also tried
                                //as newline markers, but produced LOTS of newlines which broke parsing
                                case PrTokeniser.TK_OTHER:
                                    if (tokeniser.StringValue.Equals("BDC"))
                                    {
                                        sb.Append(Environment.NewLine);
                                    }

                                    break;

                                // ReSharper disable once RedundantEmptySwitchSection - keep for debugging
                                default:
                                    //if (Debugger.IsAttached) { sb.Append($"[{tokeniser.TokenType}-{tokeniser.StringValue}]"); }
                                    break;
                            }
                        }
                    }
                    finally
                    {
                        tokeniser.Close();
                    }

                    // add newline between pages
                    sb.AppendLine();

                    //ignore latter sections
                    if (sb.ToString().Contains("ANNEX II"))
                    {
                        break;
                    }
                }
            }
            finally
            {
                //close reader, also when a page fails to tokenise
                pdf.Close();
            }

            return(sb.ToString());
        }