/// <summary>
/// Scans the content stream of every page of the PDF held by <c>pdfReader</c>
/// and reports whether any string token is exactly equal to <paramref name="text"/>.
/// </summary>
/// <remarks>
/// Every string token encountered is written to <c>_output</c>. The reader is
/// closed before the method returns, even when tokenising fails.
/// </remarks>
/// <param name="text">Value compared against whole string tokens (not substrings).</param>
/// <returns>
/// <c>true</c> when at least one string token equals <paramref name="text"/>;
/// otherwise <c>false</c>.
/// </returns>
public bool PdfContains(string text)
{
    // Collected across all pages so the membership check can run after the scan.
    var stringsList = new List<string>();

    try
    {
        for (var page = 1; page <= pdfReader.NumberOfPages; page++)
        {
            // Read the page currently being iterated (the loop used to re-read page 1 every time).
            var streamBytes = pdfReader.GetPageContent(page);
            var tokenizer = new PrTokeniser(new RandomAccessFileOrArray(streamBytes));
            try
            {
                while (tokenizer.NextToken())
                {
                    if (tokenizer.TokenType != PrTokeniser.TK_STRING)
                    {
                        continue;
                    }

                    var value = tokenizer.StringValue;
                    stringsList.Add(value);
                    _output.WriteLine(value);
                }
            }
            finally
            {
                tokenizer.Close();
            }
        }
    }
    finally
    {
        pdfReader.Close();
    }

    return stringsList.Contains(text);
}
/// <summary>
/// Extracts text from an uploaded document and returns it as JSON.
/// </summary>
/// <remarks>
/// When the upload's content type is <c>application/pdf</c>, the string tokens of
/// page 1 of the PDF are concatenated. Any other content type is opened as a
/// Word document via <c>XWPFDocument</c> and its extractor text is returned.
/// </remarks>
/// <param name="file">The uploaded file.</param>
/// <returns>A JSON result with a single <c>texto</c> property holding the extracted text.</returns>
public IActionResult PostearDoc(IFormFile file)
{
    string texto;

    if (file.ContentType == "application/pdf")
    {
        // Buffer the upload so the PDF reader can work from a byte array.
        byte[] que;
        using (MemoryStream ms = new MemoryStream())
        {
            file.CopyTo(ms);
            que = ms.ToArray();
        }

        PdfReader pdfReader = new PdfReader(que);
        try
        {
            byte[] contenidoPageUno = pdfReader.GetPageContent(1);
            PrTokeniser tokenizer = new PrTokeniser(new RandomAccessFileOrArray(contenidoPageUno));
            List<string> strList = new List<string>();

            while (tokenizer.NextToken())
            {
                if (tokenizer.TokenType == PrTokeniser.TK_STRING)
                {
                    strList.Add(tokenizer.StringValue);
                }
            }

            // Join once instead of re-allocating the string for every token.
            texto = String.Concat(strList);
        }
        finally
        {
            // Release the reader even if tokenising throws.
            pdfReader.Close();
        }
    }
    else
    {
        // Dispose the upload stream once the text has been read out of the document.
        using (var input = file.OpenReadStream())
        {
            XWPFDocument doc = new XWPFDocument(input);
            XWPFWordExtractor extractor = new XWPFWordExtractor(doc);
            texto = extractor.Text;
        }
    }

    return Json(new { texto });
}
/// <summary>
/// Tokenises page 1 of <c>issue42.pdf</c> and asserts that one of its string
/// tokens is exactly "demonstration".
/// </summary>
public void Verify_Issue42_CanBe_Processed()
{
    var pdfReader = new PdfReader(TestUtils.GetPdfsPath("issue42.pdf"));
    var pageTokens = new PrTokeniser(new RandomAccessFileOrArray(pdfReader.GetPageContent(1)));

    // Gather every string token found on the page.
    var found = new List<string>();
    while (pageTokens.NextToken())
    {
        if (pageTokens.TokenType != PrTokeniser.TK_STRING)
        {
            continue;
        }

        found.Add(pageTokens.StringValue);
    }

    pdfReader.Close();

    Assert.IsTrue(found.Contains("demonstration"));
}
/// <summary>
/// Creates the sample PDF, tokenises its first page and asserts that one of the
/// string tokens is exactly "Hello DNT!".
/// </summary>
public void Test_Extract_Text()
{
    var samplePdf = new PdfReader(createSamplePdfFile());
    var pageBytes = samplePdf.GetPageContent(1);
    var tokens = new PrTokeniser(new RandomAccessFileOrArray(pageBytes));

    // Walk the whole page; remember whether the expected token showed up.
    var sawGreeting = false;
    while (tokens.NextToken())
    {
        if (tokens.TokenType == PrTokeniser.TK_STRING && tokens.StringValue == "Hello DNT!")
        {
            sawGreeting = true;
        }
    }

    samplePdf.Close();

    Assert.IsTrue(sawGreeting);
}
/// <summary>
/// Reads every page of the PDF at <paramref name="path"/> and returns all string
/// tokens from the page content streams, each followed by a single space.
/// </summary>
/// <param name="path">Location of the PDF, passed straight to <c>PdfReader</c>.</param>
/// <returns>The concatenated string tokens; empty when no string tokens are found.</returns>
public static string Extract(string path)
{
    var reader = new PdfReader(path);
    try
    {
        var sb = new StringBuilder();

        // PDF pages are addressed starting at 1.
        for (int i = 1; i <= reader.NumberOfPages; i++)
        {
            var streamBytes = reader.GetPageContent(i);
            var tokenizer = new PrTokeniser(new RandomAccessFileOrArray(streamBytes));

            while (tokenizer.NextToken())
            {
                if (tokenizer.TokenType == PrTokeniser.TK_STRING)
                {
                    // The raw token value is what has always been emitted; the former
                    // encoding round trip produced a value that was never used.
                    sb.Append(tokenizer.StringValue).Append(' ');
                }
            }
        }

        return sb.ToString();
    }
    finally
    {
        // The reader was previously never closed.
        reader.Close();
    }
}
/// <summary>
/// Parses page 1 of the receipt PDF opened from <c>_pdfReceipt</c> and fills
/// <c>_location</c>, <c>_date</c> and <c>_pdfReceiptItems</c> from its string tokens.
/// </summary>
/// <remarks>
/// The first non-blank token becomes the location. A token containing "Dato:"
/// is parsed as the date. After the token "Pris" is seen, tokens are consumed in
/// name / count / price order to build receipt items, until the token
/// "I alt inkl. moms" ends the scan. Each token is also written to the console.
/// </remarks>
private void ExtractData()
{
    var state = State.Stop;
    var reader = new PdfReader(_pdfReceipt.OpenReadStream());
    try
    {
        var streamBytes = reader.GetPageContent(1);
        var tokenizer = new PrTokeniser(new RandomAccessFileOrArray(streamBytes));

        // Position inside the current item: 1 = name, 2 = count, 3 = price.
        int num = 1;
        var pdfReceiptItem = new PdfReceiptItemDto();
        bool isFirstWordTaken = false;

        while (tokenizer.NextToken())
        {
            if (tokenizer.TokenType != PrTokeniser.TK_STRING)
            {
                continue;
            }

            var currentText = tokenizer.StringValue;
            // Round trip through the platform default encoding, exactly as before.
            currentText = Encoding.UTF8.GetString(
                Encoding.Convert(Encoding.Default, Encoding.UTF8, Encoding.Default.GetBytes(currentText)));
            Console.WriteLine(currentText);

            // Location: the first non-blank token on the page.
            if (!isFirstWordTaken && !string.IsNullOrWhiteSpace(currentText))
            {
                _location = currentText;
                isFirstWordTaken = true;
            }

            // Date: drop the first six characters and parse the remainder.
            // NOTE(review): Remove(0, 6) throws for a token shorter than six characters,
            // and DateTime.Parse uses the current culture - confirm both are acceptable.
            if (currentText.Contains("Dato:"))
            {
                var formatted = GetFormattedDateString(currentText.Remove(0, 6));
                _date = DateTime.Parse(formatted);
            }

            // Receipt items: name (1) -> count (2) -> price (3) -> reset.
            if (state == State.Start)
            {
                switch (num)
                {
                    case 1: // name
                        pdfReceiptItem.Name = currentText;
                        if (currentText == "Rabat")
                        {
                            pdfReceiptItem.IsDiscount = true;
                        }
                        num++;
                        break;

                    case 2: // count
                        // Stays on this step until a token parses as a number.
                        // NOTE(review): double.TryParse here is culture-sensitive - confirm.
                        var countParsed = double.TryParse(currentText, out double count);
                        if (countParsed)
                        {
                            pdfReceiptItem.Count = (int)count;
                            num++;
                        }
                        break;

                    case 3: // price
                        var priceParsed = double.TryParse(currentText, out double price);
                        if (priceParsed)
                        {
                            pdfReceiptItem.Price = price;
                            _pdfReceiptItems.Add(pdfReceiptItem);
                            pdfReceiptItem = new PdfReceiptItemDto();
                        }
                        // Reset to the name step whether or not the price parsed.
                        num = 1;
                        break;
                }
            }

            // "Pris" switches item parsing on, starting with the next token.
            if (currentText == "Pris")
            {
                state = State.Start;
            }
            // The total line marks the end of the item list.
            else if (currentText == "I alt inkl. moms")
            {
                break;
            }
        }
    }
    finally
    {
        // The reader was previously never closed.
        reader.Close();
    }
}
/// <summary>
/// Uses <see cref="iTextSharp"/> library to extract plaintext
/// from pdf file.
/// </summary>
/// <remarks>
/// Pages are processed in order and separated by a newline. Processing stops
/// after the first page whose extracted text contains "ANNEX II"; that page is
/// still included in the result.
/// </remarks>
/// <param name="pathToPdf">Path to PDF file</param>
/// <returns>Plaintext string</returns>
public static string GetPlainText(string pathToPdf)
{
    var pdf = new PdfReader(pathToPdf);
    try
    {
        // string builder for output
        var sb = new StringBuilder();

        // go page-by-page; pages are 1-based, so the last page is NumberOfPages itself
        for (var i = 1; i <= pdf.NumberOfPages; i++)
        {
            // remember where this page's text starts so only it needs to be inspected below
            var pageStart = sb.Length;

            var streamBytes = pdf.GetPageContent(i);
            var tokeniser = new PrTokeniser(new RandomAccessFileOrArray(streamBytes));

            while (tokeniser.NextToken())
            {
                switch (tokeniser.TokenType)
                {
                    // string tokens seem to encompass everything we're interested in
                    case PrTokeniser.TK_STRING:
                        sb.Append(tokeniser.StringValue);
                        break;

                    // todo: find consistent way of parsing newlines.
                    // Newline markers seem to be inconsistent; adding cases like the
                    // two below is hacky but works for now. Treating the text
                    // operators (ET, TD, Td, Tm, T*) as newlines was tried and
                    // produced far too many line breaks.
                    case PrTokeniser.TK_NUMBER:
                        if (tokeniser.StringValue.Equals("-1.159"))
                        {
                            sb.Append(Environment.NewLine);
                        }
                        break;

                    case PrTokeniser.TK_OTHER:
                        if (tokeniser.StringValue.Equals("BDC"))
                        {
                            sb.Append(Environment.NewLine);
                        }
                        break;

                    // ReSharper disable once RedundantEmptySwitchSection - keep for debugging
                    default:
                        break;
                }
            }

            // add newline between pages
            sb.AppendLine();

            // ignore latter sections. Earlier pages were already checked and the
            // marker contains no line break, so it cannot straddle the page separator;
            // looking at the current page's text alone is sufficient.
            if (sb.ToString(pageStart, sb.Length - pageStart).Contains("ANNEX II"))
            {
                break;
            }
        }

        return sb.ToString();
    }
    finally
    {
        // close reader, also when extraction fails part-way
        pdf.Close();
    }
}