/// <summary>
/// Parses the PDF at <paramref name="path"/>, stores its text in <c>Texto</c>, keeps the
/// trimmed non-empty lines, and reads them as a transfer when the first line is
/// "Comprovante" or as a payment otherwise.
/// </summary>
/// <remarks>
/// Exceptions raised while splitting or reading the lines are swallowed. Afterwards,
/// when <c>DadosOK</c> is false, fallback values are applied: the first ten characters of
/// the file name are parsed as a <c>yyyy-MM-dd</c> date, which replaces <c>Agendamento</c>
/// and <c>Pagamento</c> when their year is not greater than 2000, and a <c>Valor</c> of 0
/// becomes 1. The fallback itself is not guarded, so a file name that does not start with
/// such a date makes the constructor throw.
/// </remarks>
/// <param name="path">Path of the PDF file to read.</param>
public ComprovantePDF(string path)
{
    var context = new ParserContext(path);
    var pdfParser = new PDFTextParser(context);
    Texto = pdfParser.Parse();

    try
    {
        var meaningfulLines =
            from rawLine in Texto.Split(_newline)
            let trimmed = rawLine.Trim()
            where !string.IsNullOrEmpty(trimmed)
            select trimmed;
        _textAsArray = meaningfulLines.ToArray();

        // First() throws on an empty document; that is absorbed by the catch below.
        bool isTransfer = _textAsArray.First() == "Comprovante";
        if (!isTransfer)
        {
            LerPagamento();
        }
        else
        {
            LerTransferencia();
        }
    }
    catch
    {
        // Intentionally ignored: the finally block applies fallback values instead.
    }
    finally
    {
        if (!DadosOK)
        {
            string fileName = System.IO.Path.GetFileName(path);
            string datePrefix = fileName.Substring(0, 10);
            DateTime fileDate = DateTime.ParseExact(datePrefix, "yyyy-MM-dd", CultureInfo.InvariantCulture);

            Agendamento = Agendamento.Year > 2000 ? Agendamento : fileDate;
            Pagamento = Pagamento.Year > 2000 ? Pagamento : fileDate;
            Valor = Valor == 0 ? 1 : Valor;
        }
    }
}
/// <summary>
/// Smoke test: parses a large PDF and checks that the parser completes and returns text.
/// </summary>
public void TestReadBigPDFFile()
{
    string path = TestDataSample.GetPdfPath("Word97-2007BinaryFileFormat(doc)Specification.pdf");
    var parser = new PDFTextParser(new ParserContext(path));

    string result = parser.Parse();

    // The previous assertion was the constant `true`, which could never fail;
    // check the parsed output instead so the test can actually detect a regression.
    Assert.IsFalse(string.IsNullOrEmpty(result));
}
/// <summary>
/// Verifies that the text extracted from Sample1.PDF begins with "LA MARCHE".
/// </summary>
public void TestParsePlainTextFromPDF()
{
    string path = TestDataSample.GetPdfPath("Sample1.PDF");
    var parser = new PDFTextParser(new ParserContext(path));

    string result = parser.Parse();

    // Ordinal comparison keeps the check independent of the machine's current culture.
    Assert.IsTrue(result.StartsWith("LA MARCHE", System.StringComparison.Ordinal));
}
/// <summary>
/// Extracts the text of a PDF file and returns it with CRLF line endings.
/// </summary>
/// <param name="file_path">Path of the PDF file to parse.</param>
/// <returns>The parsed text with every LF-terminated line break written as "\r\n".</returns>
private string GetTextFromPdf(string file_path)
{
    var context = new ParserContext(file_path);
    var parser = new PDFTextParser(context);
    var content = parser.Parse();

    // Collapse existing CRLF pairs to LF first; otherwise a "\r\n" already present
    // in the parsed text would be expanded into "\r\r\n".
    return content.Replace("\r\n", "\n").Replace("\n", "\r\n");
}
/// <summary>
/// Verifies the first two LF-separated lines of the text extracted from Sample5.PDF.
/// </summary>
public void TestParsePlainTextFromSample5()
{
    string pdfPath = TestDataSample.GetPdfPath("Sample5.PDF");
    var context = new ParserContext(pdfPath);
    var pdfParser = new PDFTextParser(context);

    string extracted = pdfParser.Parse();
    string[] lines = extracted.Split('\n');

    Assert.AreEqual("License income by market (%)", lines[0]);
    Assert.AreEqual("Philadelphia, Atlanta, Dallas, San Diego, and New", lines[1]);
}
/// <summary>
/// Verifies that the text extracted from Sample1.PDF starts with "LA MARCHE" and
/// contains several expected French sentences.
/// </summary>
public void TestParsePlainTextFromSample1()
{
    string path = TestDataSample.GetPdfPath("Sample1.PDF");
    var parser = new PDFTextParser(new ParserContext(path));

    string result = parser.Parse();

    // Ordinal comparison keeps the check independent of the machine's current culture.
    Assert.IsTrue(result.StartsWith("LA MARCHE", System.StringComparison.Ordinal));
    ContainText(result, "Toute discussion stratégique sur nos actions nécessite un rappel de ce que nous avons fait en");
    ContainText(result, "l’an 2000 et depuis. Au niveau mondial, en l’an 2000, nous avons mené une campagne de");
    ContainText(result, "Une structure pour nous amener à 2005");
    ContainText(result, "Lors de la 4e rencontre qui aura lieu en Inde, nous avons deux objectifs majeurs");
}
/// <summary>
/// Downloads attachments one id at a time, starting just past the highest id reported by
/// the index, extracts their text and saves every attachment whose text is non-empty.
/// </summary>
/// <remarks>
/// The id advances on every iteration, including failed ones. The loop stops once a
/// response has no content-type header, or after <c>maxRetries</c> consecutive failed
/// iterations (a successful iteration resets the counter).
/// </remarks>
/// <returns>
/// The attachment list, which is currently always empty: items are saved through
/// <c>elastic.SaveItem</c> and are not added to the list.
/// </returns>
public List<Attatchment> FetchAttachementsData()
{
    string index = _source.index;
    Elastic elastic = new Elastic(index);
    List<Attatchment> lstAttachments = new List<Attatchment>();

    int id = elastic.GetMaxId() + 1;
    Console.WriteLine("===> Max id:" + id);
    int maxRetries = _source.retries ?? 5;

    // WebClient is disposable; release it once the whole scan is finished.
    using (WebClient wc = new WebClient())
    {
        bool end = false;
        int retry = 0;
        do
        {
            var tempFileName = Path.GetTempFileName();
            try
            {
                string url = _source.url ?? "{0}";
                url = string.Format(url, id++);
                Console.WriteLine("=> Get " + url);
                wc.DownloadFile(url, tempFileName);

                var mimeType = wc.ResponseHeaders["content-type"];
                Console.WriteLine("=> Mimetype " + mimeType);

                var contentDisposition = wc.ResponseHeaders["Content-Disposition"];
                if (contentDisposition == null)
                {
                    // Without the header there is no file name to dispatch on. This used to
                    // surface as a NullReferenceException; it still counts as a failed attempt.
                    Console.WriteLine("=> Missing Content-Disposition header");
                    retry++;
                    continue;
                }

                // Text after "filename=", with quotes removed. A header lacking "filename="
                // is not treated specially.
                var fileName = contentDisposition
                    .Substring(contentDisposition.IndexOf("filename=", StringComparison.Ordinal) + 9)
                    .Replace("\"", "");
                Console.WriteLine("=> Filename " + fileName);

                if (string.IsNullOrEmpty(mimeType))
                {
                    end = true;
                }

                var body = ExtractAttachmentText(fileName, tempFileName);
                if (!string.IsNullOrEmpty(body))
                {
                    // NOTE(review): id was already incremented when the URL was built, so the
                    // stored id is one greater than the id in `url`. Looks unintended, but
                    // changing it would shift persisted ids — confirm before fixing.
                    Attatchment attatch = new Attatchment(id.ToString(), fileName, url, mimeType, body, DateTime.Now);
                    elastic.SaveItem(attatch);
                }

                retry = 0;
            }
            catch (Exception ex)
            {
                Console.WriteLine(ex);
                retry++;
            }
            finally
            {
                // Best effort: a leftover temp file must not abort the scan.
                try
                {
                    File.Delete(tempFileName);
                }
                catch
                {
                }
            }
        } while (!end && retry < maxRetries);
    }

    return lstAttachments;
}

/// <summary>
/// Extracts plain text from a downloaded file, choosing the parser from the extension
/// of <paramref name="fileName"/> (case-insensitive).
/// </summary>
/// <param name="fileName">File name reported by the server; only its extension is used.</param>
/// <param name="filePath">Path of the downloaded file on disk.</param>
/// <returns>
/// The parser output, or an empty string when the extension is none of
/// .pdf, .docx, .rtf or .doc.
/// </returns>
private static string ExtractAttachmentText(string fileName, string filePath)
{
    if (fileName.EndsWith(".pdf", StringComparison.OrdinalIgnoreCase))
    {
        return new PDFTextParser(new Toxy.ParserContext(filePath)).Parse();
    }

    bool isDocx = fileName.EndsWith(".docx", StringComparison.OrdinalIgnoreCase);
    bool isRtf = fileName.EndsWith(".rtf", StringComparison.OrdinalIgnoreCase);
    bool isDoc = fileName.EndsWith(".doc", StringComparison.OrdinalIgnoreCase);
    if (!isDocx && !isRtf && !isDoc)
    {
        return string.Empty;
    }

    // The code-page encoding provider is registered before any of the Word/RTF parsers run.
    System.Text.Encoding.RegisterProvider(System.Text.CodePagesEncodingProvider.Instance);

    if (isDocx)
    {
        return new Word2007TextParser(new Toxy.ParserContext(filePath)).Parse();
    }

    if (isRtf)
    {
        return new RTFTextParser(new Toxy.ParserContext(filePath)).Parse();
    }

    return new Word2003TextParser(new Toxy.ParserContext(filePath)).Parse();
}