/// <summary>
/// Scans the content stream of every page of <c>pdfReader</c> and reports whether any
/// string token is exactly equal to <paramref name="text"/>.
/// </summary>
/// <param name="text">The exact string-token value to look for.</param>
/// <returns><c>true</c> if at least one string token equals <paramref name="text"/>; otherwise <c>false</c>.</returns>
/// <remarks>
/// Every string token is written to <c>_output</c>, and <c>pdfReader</c> is closed before
/// returning, so the reader cannot be reused after this call.
/// </remarks>
public bool PdfContains(string text)
{
    var found = false;
    try
    {
        for (var page = 1; page <= pdfReader.NumberOfPages; page++)
        {
            // Read the page currently being iterated (not always page 1).
            var streamBytes = pdfReader.GetPageContent(page);
            var tokenizer = new PrTokeniser(new RandomAccessFileOrArray(streamBytes));
            try
            {
                while (tokenizer.NextToken())
                {
                    if (tokenizer.TokenType != PrTokeniser.TK_STRING)
                    {
                        continue;
                    }

                    var value = tokenizer.StringValue;
                    _output.WriteLine(value);

                    // Keep scanning after a match so that the log still lists every token.
                    if (value == text)
                    {
                        found = true;
                    }
                }
            }
            finally
            {
                tokenizer.Close();
            }
        }
    }
    finally
    {
        pdfReader.Close();
    }

    return found;
}
/// <summary>
/// Decodes percent-escapes in a URL: each "%xx" sequence is replaced by the single
/// character whose code is the hexadecimal value 'xx'.
/// </summary>
/// <remarks>
/// A '%' that has fewer than two characters after it, or whose next two characters are
/// not both accepted by <c>PrTokeniser.GetHex</c> (negative result), is copied through unchanged.
/// </remarks>
/// <param name="src">the url to unescape</param>
/// <returns>the unescaped value</returns>
public static string UnEscapeUrl(string src)
{
    var result = new StringBuilder();
    char[] chars = src.ToCharArray();
    int i = 0;
    while (i < chars.Length)
    {
        char current = chars[i];

        // Ordinary character, or a '%' too close to the end to carry two hex digits.
        if (current != '%' || i + 2 >= chars.Length)
        {
            result.Append(current);
            i++;
            continue;
        }

        int high = PrTokeniser.GetHex(chars[i + 1]);
        int low = PrTokeniser.GetHex(chars[i + 2]);
        if (high < 0 || low < 0)
        {
            // Not a valid escape: keep the '%' and let the following characters be handled normally.
            result.Append(current);
            i++;
            continue;
        }

        result.Append((char)(high * 16 + low));
        i += 3; // consume '%' plus both hex digits
    }

    return result.ToString();
}
/// <summary>
/// Extracts text from an uploaded document and returns it as JSON (<c>{ texto }</c>).
/// </summary>
/// <param name="file">
/// The uploaded file. When its content type is <c>application/pdf</c>, the string tokens of
/// page 1 are concatenated; any other content type is opened as a Word (XWPF) document.
/// </param>
/// <returns>A JSON result whose <c>texto</c> property holds the extracted text.</returns>
public IActionResult PostearDoc(IFormFile file)
{
    string texto;
    if (file.ContentType == "application/pdf")
    {
        byte[] pdfBytes;
        using (MemoryStream ms = new MemoryStream())
        {
            file.CopyTo(ms);
            pdfBytes = ms.ToArray();
        }

        PdfReader pdfReader = new PdfReader(pdfBytes);
        try
        {
            // Only the first page is read, as before.
            byte[] contenidoPageUno = pdfReader.GetPageContent(1);
            PrTokeniser tokenizer = new PrTokeniser(new RandomAccessFileOrArray(contenidoPageUno));
            List<string> strList = new List<string>();
            while (tokenizer.NextToken())
            {
                if (tokenizer.TokenType == PrTokeniser.TK_STRING)
                {
                    strList.Add(tokenizer.StringValue);
                }
            }

            // Join once at the end instead of re-allocating the string for every token.
            texto = string.Concat(strList);
        }
        finally
        {
            // Release the reader even when tokenising fails.
            pdfReader.Close();
        }
    }
    else
    {
        // Dispose the upload stream once the text has been extracted.
        using (var stream = file.OpenReadStream())
        {
            XWPFDocument doc = new XWPFDocument(stream);
            XWPFWordExtractor extractor = new XWPFWordExtractor(doc);
            texto = extractor.Text;
        }
    }

    return Json(new { texto });
}
/// <summary>
/// Tokenises page 1 of <c>issue42.pdf</c> and asserts that one of its string tokens
/// is exactly "demonstration".
/// </summary>
public void Verify_Issue42_CanBe_Processed()
{
    var pdfPath = TestUtils.GetPdfsPath("issue42.pdf");
    var pdfReader = new PdfReader(pdfPath);
    var firstPageBytes = pdfReader.GetPageContent(1);
    var tokens = new PrTokeniser(new RandomAccessFileOrArray(firstPageBytes));

    var extracted = new List<string>();
    while (tokens.NextToken())
    {
        // Skip operators, numbers, names, etc. — only literal strings are collected.
        if (tokens.TokenType != PrTokeniser.TK_STRING)
        {
            continue;
        }

        extracted.Add(tokens.StringValue);
    }

    pdfReader.Close();

    Assert.IsTrue(extracted.Contains("demonstration"));
}
/// <summary>
/// Creates the sample PDF, tokenises its first page and asserts that one of the string
/// tokens is exactly "Hello DNT!".
/// </summary>
public void Test_Extract_Text()
{
    var samplePdf = createSamplePdfFile();
    var pdfReader = new PdfReader(samplePdf);
    var pageBytes = pdfReader.GetPageContent(1);
    var tokens = new PrTokeniser(new RandomAccessFileOrArray(pageBytes));

    var texts = new List<string>();
    for (var hasToken = tokens.NextToken(); hasToken; hasToken = tokens.NextToken())
    {
        // Collect literal-string tokens only.
        if (tokens.TokenType == PrTokeniser.TK_STRING)
        {
            texts.Add(tokens.StringValue);
        }
    }

    pdfReader.Close();

    Assert.IsTrue(texts.Contains("Hello DNT!"));
}
/// <summary>
/// Concatenates every string token found in the content streams of all pages of a PDF.
/// </summary>
/// <param name="path">Path of the PDF file to read.</param>
/// <returns>All string tokens in page order, each followed by a single space.</returns>
public static string Extract(string path)
{
    var reader = new PdfReader(path);
    try
    {
        var sb = new StringBuilder();
        for (int i = 1; i <= reader.NumberOfPages; i++)
        {
            var streamBytes = reader.GetPageContent(i);
            var tokenizer = new PrTokeniser(new RandomAccessFileOrArray(streamBytes));
            while (tokenizer.NextToken())
            {
                if (tokenizer.TokenType == PrTokeniser.TK_STRING)
                {
                    // The raw token value is what has always been emitted; the former
                    // encoding round-trip produced a value that was never used.
                    sb.Append(tokenizer.StringValue).Append(' ');
                }
            }
        }

        return sb.ToString();
    }
    finally
    {
        // Release the reader (and its file handle) even when parsing fails.
        reader.Close();
    }
}
/// <summary>
/// Opens the template PDF, logs a per-page diagnostic (has fonts / has images / is blank),
/// flattens its form fields and returns the stamped document.
/// </summary>
/// <returns>A <see cref="FileContentResult"/> with content type <c>application/pdf</c>.</returns>
public FileResult ReadPdf()
{
    // Read the template file.
    // NOTE(review): the input path is hard-coded — confirm this is intended outside of local experiments.
    PdfReader reader = new PdfReader(@"G:\11.pdf");
    try
    {
        // Memory stream that receives the filled/flattened template.
        System.IO.MemoryStream stream = new System.IO.MemoryStream();
        PdfStamper stamp = new PdfStamper(reader, stream);

        // Pages are numbered from 1.
        for (var pageNum = 1; pageNum <= reader.NumberOfPages; pageNum++)
        {
            // First check: examine the resource dictionary for /Font or /XObject keys.
            // If either is present the page is not blank.
            var pageDict = reader.GetPageN(pageNum);

            // GetAsDict yields null when /Resources is missing instead of failing on a cast.
            var resDict = pageDict.GetAsDict(PdfName.Resources);

            if (resDict != null && resDict.Get(PdfName.Font) != null)
            {
                Console.WriteLine($"Page {pageNum} has font(s).");
                continue;
            }

            if (resDict != null && resDict.Get(PdfName.Xobject) != null)
            {
                Console.WriteLine($"Page {pageNum} has image(s).");
                continue;
            }

            // Second check: a very short content stream is treated as a blank page.
            var content = reader.GetPageContent(pageNum);
            if (content.Length <= 20)
            {
                Console.WriteLine($"Page {pageNum} is blank");
            }
        }

        // Lock (flatten) the form's text fields.
        stamp.FormFlattening = true;

        // The stamper writes the finished document only when it is closed, so close it
        // before reading the stream. MemoryStream.ToArray still works on a closed stream.
        stamp.Close();

        // Build the downloadable file.
        return new FileContentResult(stream.ToArray(), "application/pdf");
    }
    finally
    {
        reader.Close();
    }
}
/// <summary>
/// Walks the string tokens on page 1 of the receipt PDF and fills <c>_location</c>,
/// <c>_date</c> and <c>_pdfReceiptItems</c>.
/// </summary>
/// <remarks>
/// The first non-blank token becomes the location. A token containing "Dato:" supplies the
/// date. After the token "Pris", tokens are consumed in name → count → price triples until
/// the token "I alt inkl. moms" is reached.
/// </remarks>
private void ExtractData()
{
    var state = State.Stop;
    var reader = new PdfReader(_pdfReceipt.OpenReadStream());
    try
    {
        var streamBytes = reader.GetPageContent(1);
        var tokenizer = new PrTokeniser(new RandomAccessFileOrArray(streamBytes));

        // Position inside the current item triple: 1 = name, 2 = count, 3 = price.
        int num = 1;
        var pdfReceiptItem = new PdfReceiptItemDto();
        bool isFirstWordTaken = false;

        while (tokenizer.NextToken())
        {
            if (tokenizer.TokenType != PrTokeniser.TK_STRING)
            {
                continue;
            }

            // Round-trips the token through Encoding.Default and UTF-8.
            // NOTE(review): presumably meant to normalise the text encoding — confirm it is still needed.
            var currentText = tokenizer.StringValue;
            currentText = Encoding.UTF8.GetString(ASCIIEncoding.Convert(Encoding.Default, Encoding.UTF8, Encoding.Default.GetBytes(currentText)));
            Console.WriteLine(currentText);

            // Location: the first non-blank string on the page.
            if (!isFirstWordTaken && !string.IsNullOrWhiteSpace(currentText))
            {
                _location = currentText;
                isFirstWordTaken = true;
            }

            // Date: drop the first 6 characters, then parse the rest. The length guard
            // avoids an ArgumentOutOfRangeException from Remove on a token shorter than that.
            if (currentText.Contains("Dato:") && currentText.Length >= 6)
            {
                var formatted = GetFormattedDateString(currentText.Remove(0, 6));
                _date = DateTime.Parse(formatted);
            }

            // Receipt items: name (1) -> count (2) -> price (3) -> reset.
            if (state == State.Start)
            {
                switch (num)
                {
                    case 1: // name
                        pdfReceiptItem.Name = currentText;
                        if (currentText == "Rabat")
                        {
                            pdfReceiptItem.IsDiscount = true;
                        }
                        num++;
                        break;

                    case 2: // count — stays on this step until a token parses as a number
                        var countParsed = double.TryParse(currentText, out double count);
                        if (countParsed)
                        {
                            pdfReceiptItem.Count = (int)count;
                            num++;
                        }
                        break;

                    case 3: // price — the item is stored only when the price parses
                        var priceParsed = double.TryParse(currentText, out double price);
                        if (priceParsed)
                        {
                            pdfReceiptItem.Price = price;
                            _pdfReceiptItems.Add(pdfReceiptItem);
                            pdfReceiptItem = new PdfReceiptItemDto();
                        }
                        num = 1;
                        break;
                }
            }

            if (currentText == "Pris")
            {
                // Header seen: item triples start with the next token.
                state = State.Start;
            }
            else if (currentText == "I alt inkl. moms")
            {
                // Total line reached: stop reading.
                break;
            }
        }
    }
    finally
    {
        // Release the reader even when parsing throws (e.g. DateTime.Parse).
        reader.Close();
    }
}
/// <summary>
/// Uses <see cref="iTextSharp"/> library to extract plaintext
/// from pdf file.
/// </summary>
/// <remarks>
/// Pages are processed in order, including the last one. Extraction stops after the first
/// page whose extracted text contains "ANNEX II"; that page is still included in the output.
/// </remarks>
/// <param name="pathToPdf">Path to PDF file</param>
/// <returns>Plaintext string</returns>
public static string GetPlainText(string pathToPdf)
{
    var pdf = new PdfReader(pathToPdf);
    try
    {
        // String builder for output.
        var sb = new StringBuilder();

        // Go page by page; page numbers are 1-based and the bound is inclusive.
        for (var i = 1; i <= pdf.NumberOfPages; i++)
        {
            var pageStart = sb.Length;
            var streamBytes = pdf.GetPageContent(i);
            var tokeniser = new PrTokeniser(new RandomAccessFileOrArray(streamBytes));
            while (tokeniser.NextToken())
            {
                switch (tokeniser.TokenType)
                {
                    // String tokens seem to encompass everything we're interested in.
                    case PrTokeniser.TK_STRING:
                        sb.Append(tokeniser.StringValue);
                        break;

                    // TODO: find a consistent way of parsing newlines.
                    // These newline markers seem to be inconsistent; adding cases as they are
                    // seen is hacky but works for now.
                    case PrTokeniser.TK_NUMBER:
                        if (tokeniser.StringValue.Equals("-1.159"))
                        {
                            sb.Append(Environment.NewLine);
                        }
                        break;

                    case PrTokeniser.TK_OTHER:
                        if (tokeniser.StringValue.Equals("BDC"))
                        {
                            sb.Append(Environment.NewLine);
                        }
                        break;

                    // Every other token type is ignored.
                    default:
                        break;
                }
            }

            // Add newline between pages.
            sb.AppendLine();

            // Ignore latter sections. Only the text added for this page is inspected: earlier
            // pages were already checked, and the marker contains no line break, so it cannot
            // span the separator between two pages.
            if (sb.ToString(pageStart, sb.Length - pageStart).Contains("ANNEX II"))
            {
                break;
            }
        }

        return sb.ToString();
    }
    finally
    {
        // Close reader, also when extraction fails.
        pdf.Close();
    }
}