示例#1
0
        /// <summary>
        /// Tokenizes the content of every page of <c>pdfReader</c>, logs each string token
        /// to <c>_output</c>, and reports whether one of those tokens equals <paramref name="text"/>.
        /// </summary>
        /// <param name="text">The exact string token to look for.</param>
        /// <returns>
        /// <c>true</c> when a string token equal to <paramref name="text"/> was found;
        /// otherwise <c>false</c>.
        /// </returns>
        /// <remarks>
        /// <c>pdfReader</c> is closed before this method returns, so the reader cannot be
        /// used for another scan afterwards.
        /// </remarks>
        public bool PdfContains(string text)
        {
            // Collected across all pages so the final lookup sees the whole document.
            var stringsList = new List<string>();

            try
            {
                for (var page = 1; page <= pdfReader.NumberOfPages; page++)
                {
                    // Read the page currently being visited (previously page 1 was read every time).
                    var streamBytes = pdfReader.GetPageContent(page);
                    var tokenizer   = new PrTokeniser(new RandomAccessFileOrArray(streamBytes));

                    try
                    {
                        while (tokenizer.NextToken())
                        {
                            if (tokenizer.TokenType == PrTokeniser.TK_STRING)
                            {
                                stringsList.Add(tokenizer.StringValue);
                                _output.WriteLine(tokenizer.StringValue);
                            }
                        }
                    }
                    finally
                    {
                        // Release the tokenizer even when tokenizing fails part-way.
                        tokenizer.Close();
                    }
                }
            }
            finally
            {
                pdfReader.Close();
            }

            return stringsList.Contains(text);
        }
        /// <summary>
        /// Unescapes a URL: every "%xx" sequence whose two digits are valid hexadecimal
        /// is replaced by the character with that hex value.
        /// </summary>
        /// <param name="src">the URL to unescape</param>
        /// <returns>the unescaped value</returns>
        public static string UnEscapeUrl(string src)
        {
            var decoded = new StringBuilder();
            int length  = src.Length;
            int pos     = 0;

            while (pos < length)
            {
                char current = src[pos];

                // A '%' can only start an escape when two more characters follow it.
                if (current == '%' && pos + 2 < length)
                {
                    int high = PrTokeniser.GetHex(src[pos + 1]);
                    int low  = PrTokeniser.GetHex(src[pos + 2]);

                    if (high >= 0 && low >= 0)
                    {
                        decoded.Append((char)(high * 16 + low));
                        // Skip the '%' and both hex digits.
                        pos += 3;
                        continue;
                    }
                }

                // Ordinary character, or a '%' that does not introduce a valid escape:
                // copy it through unchanged.
                decoded.Append(current);
                pos++;
            }

            return decoded.ToString();
        }
示例#3
0
        /// <summary>
        /// Extracts text from an uploaded document and returns it as JSON.
        /// </summary>
        /// <param name="file">
        /// The uploaded file. When its content type is "application/pdf" the string tokens of
        /// the first PDF page are concatenated; any other content type is passed to the
        /// XWPF word extractor.
        /// </param>
        /// <returns>A JSON object with a single <c>texto</c> property holding the extracted text.</returns>
        public IActionResult PostearDoc(IFormFile file)
        {
            string texto;

            if (file.ContentType == "application/pdf")
            {
                byte[] que;
                using (MemoryStream ms = new MemoryStream())
                {
                    file.CopyTo(ms);
                    que = ms.ToArray();
                }

                PdfReader pdfReader = new PdfReader(que);
                try
                {
                    // Only the first page is tokenized.
                    byte[] contenidoPageUno = pdfReader.GetPageContent(1);

                    PrTokeniser tokenizer = new PrTokeniser(new RandomAccessFileOrArray(contenidoPageUno));

                    // Builder instead of repeated string concatenation inside the loop.
                    System.Text.StringBuilder acumulado = new System.Text.StringBuilder();

                    while (tokenizer.NextToken())
                    {
                        if (tokenizer.TokenType == PrTokeniser.TK_STRING)
                        {
                            acumulado.Append(tokenizer.StringValue);
                        }
                    }

                    texto = acumulado.ToString();
                }
                finally
                {
                    // Close the reader even when tokenizing throws.
                    pdfReader.Close();
                }
            }
            else
            {
                // Dispose the upload stream once the text has been extracted.
                using (Stream entrada = file.OpenReadStream())
                {
                    XWPFDocument      doc       = new XWPFDocument(entrada);
                    XWPFWordExtractor extractor = new XWPFWordExtractor(doc);
                    texto = extractor.Text;
                }
            }

            return(Json(new { texto }));
        }
        /// <summary>
        /// Tokenizes page 1 of "issue42.pdf" and asserts that one of its string tokens
        /// is exactly "demonstration".
        /// </summary>
        public void Verify_Issue42_CanBe_Processed()
        {
            var pdfPath   = TestUtils.GetPdfsPath("issue42.pdf");
            var pdfReader = new PdfReader(pdfPath);

            var pageBytes = pdfReader.GetPageContent(1);
            var lexer     = new PrTokeniser(new RandomAccessFileOrArray(pageBytes));

            var tokens = new List<string>();

            while (lexer.NextToken())
            {
                // Only string tokens are of interest; skip everything else.
                if (lexer.TokenType != PrTokeniser.TK_STRING)
                {
                    continue;
                }

                tokens.Add(lexer.StringValue);
            }

            pdfReader.Close();

            bool found = tokens.Contains("demonstration");
            Assert.IsTrue(found);
        }
        /// <summary>
        /// Creates the sample PDF, tokenizes its first page and asserts that one of the
        /// string tokens is exactly "Hello DNT!".
        /// </summary>
        public void Test_Extract_Text()
        {
            var samplePdf = createSamplePdfFile();
            var pdfReader = new PdfReader(samplePdf);

            var pageBytes = pdfReader.GetPageContent(1);
            var lexer     = new PrTokeniser(new RandomAccessFileOrArray(pageBytes));

            var tokens = new List<string>();

            while (lexer.NextToken())
            {
                // Only string tokens are of interest; skip everything else.
                if (lexer.TokenType != PrTokeniser.TK_STRING)
                {
                    continue;
                }

                tokens.Add(lexer.StringValue);
            }

            pdfReader.Close();

            bool found = tokens.Contains("Hello DNT!");
            Assert.IsTrue(found);
        }
示例#6
0
        /// <summary>
        /// Concatenates every string token found in the content of all pages of a PDF file.
        /// </summary>
        /// <param name="path">Path of the PDF file to read.</param>
        /// <returns>All string tokens in page order, each followed by a single space.</returns>
        public static string Extract(string path)
        {
            var reader = new PdfReader(path);
            var sb     = new StringBuilder();

            try
            {
                for (int i = 1; i <= reader.NumberOfPages; i++)
                {
                    var streamBytes = reader.GetPageContent(i);
                    var tokenizer   = new PrTokeniser(new RandomAccessFileOrArray(streamBytes));

                    while (tokenizer.NextToken())
                    {
                        if (tokenizer.TokenType == PrTokeniser.TK_STRING)
                        {
                            // The raw token value is emitted; the former re-encoding step
                            // computed a value that was never used and has been removed.
                            sb.Append(tokenizer.StringValue).Append(' ');
                        }
                    }
                }
            }
            finally
            {
                // Previously the reader was never closed.
                reader.Close();
            }

            return(sb.ToString());
        }
示例#7
0
        /// <summary>
        /// Opens the PDF at the hard-coded path, writes to the console which pages have
        /// fonts, images, or look blank, flattens the form fields through a stamper and
        /// returns the stamped document.
        /// </summary>
        /// <returns>
        /// A <see cref="FileContentResult"/> with content type "application/pdf" holding the
        /// bytes written by the stamper.
        /// </returns>
        public FileResult ReadPdf()
        {
            // Read the template file.
            PdfReader reader = new PdfReader(@"G:\11.pdf");

            // In-memory stream that receives the stamped document.
            System.IO.MemoryStream stream = new System.IO.MemoryStream();

            try
            {
                PdfStamper stamp = new PdfStamper(reader, stream);

                for (var pageNum = 1; pageNum <= reader.NumberOfPages; pageNum++)
                {
                    // First check: examine the resource dictionary for /Font or /XObject keys.
                    // If either is present the page is not blank.
                    var pageDict = reader.GetPageN(pageNum);
                    // GetAsDict yields null when the entry is missing, so guard the lookups below.
                    var resDict  = pageDict.GetAsDict(PdfName.Resources);

                    var hasFont = resDict != null && resDict.Get(PdfName.Font) != null;
                    if (hasFont)
                    {
                        Console.WriteLine($"Page {pageNum} has font(s).");
                        continue;
                    }

                    var hasImage = resDict != null && resDict.Get(PdfName.Xobject) != null;
                    if (hasImage)
                    {
                        Console.WriteLine($"Page {pageNum} has image(s).");
                        continue;
                    }

                    // Second check: a content stream of at most 20 bytes is reported as blank.
                    var content = reader.GetPageContent(pageNum);
                    if (content.Length <= 20)
                    {
                        Console.WriteLine($"Page {pageNum} is blank");
                    }
                }

                // Flatten the form so its fields are no longer editable.
                stamp.FormFlattening = true;

                // Close the stamper before the stream is read: the original never closed it,
                // so the returned bytes did not contain the finished document.
                stamp.Close();
            }
            finally
            {
                reader.Close();
            }

            // Build the file result from the stamped bytes.
            FileResult fileResult = new FileContentResult(stream.ToArray(), "application/pdf");

            return(fileResult);
        }
示例#8
0
        /// <summary>
        /// Tokenizes the first page of the PDF receipt and fills <c>_location</c>,
        /// <c>_date</c> and <c>_pdfReceiptItems</c> from its string tokens.
        /// </summary>
        /// <remarks>
        /// The first non-blank token becomes the location. A token containing "Dato:" is
        /// parsed as the date. After the token "Pris" is seen, the following tokens are read
        /// in name / count / price order until "I alt inkl. moms" ends the scan.
        /// </remarks>
        private void ExtractData()
        {
            var state  = State.Stop;
            var reader = new PdfReader(_pdfReceipt.OpenReadStream());

            try
            {
                var streamBytes = reader.GetPageContent(1);
                var tokenizer   = new PrTokeniser(new RandomAccessFileOrArray(streamBytes));

                // Position inside the current item: 1 = name, 2 = count, 3 = price.
                int num            = 1;
                var pdfReceiptItem = new PdfReceiptItemDto();

                bool isFirstWordTaken = false;

                while (tokenizer.NextToken())
                {
                    if (tokenizer.TokenType == PrTokeniser.TK_STRING)
                    {
                        var currentText = tokenizer.StringValue;
                        currentText = Encoding.UTF8.GetString(ASCIIEncoding.Convert(Encoding.Default,
                                                                                    Encoding.UTF8, Encoding.Default.GetBytes(currentText)));
                        Console.WriteLine(currentText);

                        // Location: the first non-blank string token.
                        if (!isFirstWordTaken && !string.IsNullOrWhiteSpace(currentText))
                        {
                            _location        = currentText;
                            isFirstWordTaken = true;
                        }

                        // Date: drop the first six characters, then format and parse the rest.
                        if (currentText.Contains("Dato:"))
                        {
                            var formatted = GetFormattedDateString(currentText.Remove(0, 6));
                            _date = DateTime.Parse(formatted);
                        }

                        // Receipt item fields, only once the "Pris" header has been seen.
                        if (state == State.Start)
                        {
                            // name 1 -> count 2 -> price 3 -> reset
                            switch (num)
                            {
                            case 1:
                                // name
                                pdfReceiptItem.Name = currentText;
                                if (currentText == "Rabat")
                                {
                                    pdfReceiptItem.IsDiscount = true;
                                }
                                num++;
                                break;

                            case 2:
                                // count; stay on this step until a numeric token arrives
                                var countParsed = double.TryParse(currentText, out double count);

                                if (countParsed)
                                {
                                    pdfReceiptItem.Count = (int)count;
                                    num++;
                                }

                                break;

                            case 3:
                                // price; the item is stored only when the token is numeric
                                var priceParsed = double.TryParse(currentText, out double price);

                                if (priceParsed)
                                {
                                    pdfReceiptItem.Price = price;
                                    _pdfReceiptItems.Add(pdfReceiptItem);
                                    pdfReceiptItem = new PdfReceiptItemDto();
                                }
                                num = 1;
                                break;
                            }
                        }

                        // "Pris" switches item parsing on for the tokens that follow.
                        if (currentText == "Pris")
                        {
                            state = State.Start;
                        }

                        // The total line ends the scan.
                        else if (currentText == "I alt inkl. moms")
                        {
                            break;
                        }
                    }
                }
            }
            finally
            {
                // Previously the reader was never closed on any path.
                reader.Close();
            }
        }
示例#9
0
        /// <summary>
        /// Uses <see cref="iTextSharp"/> library to extract plaintext
        /// from pdf file.
        /// </summary>
        /// <param name="pathToPdf">Path to PDF file</param>
        /// <returns>
        /// Plaintext string: the string tokens of each page, with a line break after every
        /// page. Extraction stops after the first page at which the accumulated text
        /// contains "ANNEX II".
        /// </returns>
        public static string GetPlainText(string pathToPdf)
        {
            var pdf = new PdfReader(pathToPdf);
            // string builder for output
            var sb = new StringBuilder();

            try
            {
                // Go page-by-page. The upper bound is inclusive so the last page is
                // processed too (the former "<" bound skipped it).
                for (var i = 1; i <= pdf.NumberOfPages; i++)
                {
                    var streamBytes = pdf.GetPageContent(i);
                    var tokeniser   = new PrTokeniser(new RandomAccessFileOrArray(streamBytes));

                    while (tokeniser.NextToken())
                    {
                        switch (tokeniser.TokenType)
                        {
                        // string tokens seem to encompass everything we're interested in
                        case PrTokeniser.TK_STRING:
                            sb.Append(tokeniser.StringValue);
                            break;

                        // TODO: find a consistent way of parsing newlines.
                        // The tokens below are treated as line breaks; the set is ad hoc
                        // and gets extended as new cases are seen.
                        case PrTokeniser.TK_NUMBER:
                            if (tokeniser.StringValue.Equals("-1.159"))
                            {
                                sb.Append(Environment.NewLine);
                            }

                            break;

                        case PrTokeniser.TK_OTHER:
                            // Treating the "TD"/"Td" operators as line breaks as well was
                            // tried and produced too many newlines, which broke parsing.
                            if (tokeniser.StringValue.Equals("BDC"))
                            {
                                sb.Append(Environment.NewLine);
                            }

                            break;

                        // ReSharper disable once RedundantEmptySwitchSection - keep for debugging
                        default:
                            break;
                        }
                    }
                    // add newline between pages
                    sb.AppendLine();

                    // ignore latter sections
                    if (sb.ToString().Contains("ANNEX II"))
                    {
                        break;
                    }
                }
            }
            finally
            {
                // close reader, also when a page fails to parse
                pdf.Close();
            }

            return(sb.ToString());
        }