Example #1
0
        /// <summary>
        /// Builds a receipt from the PDF at <paramref name="path"/>: extracts its text, splits it
        /// into trimmed non-empty lines and runs the transfer reader when the first line is
        /// "Comprovante", otherwise the payment reader. When <c>DadosOK</c> is false afterwards,
        /// fallback values are applied using the date at the start of the file name.
        /// </summary>
        /// <param name="path">
        /// Path of the PDF file. The first ten characters of its file name are parsed as a
        /// <c>yyyy-MM-dd</c> date whenever the fallback runs.
        /// </param>
        public ComprovantePDF(string path)
        {
            Texto = new PDFTextParser(new ParserContext(path)).Parse();

            try
            {
                // Keep only the trimmed, non-empty lines of the extracted text.
                _textAsArray = Texto.Split(_newline)
                                    .Select(line => line.Trim())
                                    .Where(line => !string.IsNullOrEmpty(line))
                                    .ToArray();

                bool isTransfer = _textAsArray.First() == "Comprovante";
                if (!isTransfer)
                {
                    LerPagamento();
                }
                else
                {
                    LerTransferencia();
                }
            }
            catch
            {
                // Failures while reading are ignored here; the finally block fills in
                // fallback values when DadosOK is false.
            }
            finally
            {
                if (!DadosOK)
                {
                    string   datePrefix = System.IO.Path.GetFileName(path).Substring(0, 10);
                    DateTime fileDate   = DateTime.ParseExact(datePrefix, "yyyy-MM-dd",
                                                              CultureInfo.InvariantCulture);

                    // Dates with a year of 2000 or earlier are replaced by the file-name date.
                    Agendamento = Agendamento.Year <= 2000 ? fileDate : Agendamento;
                    Pagamento   = Pagamento.Year <= 2000 ? fileDate : Pagamento;

                    // A zero amount is replaced by 1.
                    Valor = Valor != 0 ? Valor : 1;
                }
            }
        }
Example #2
0
        /// <summary>
        /// Smoke test: parsing a large PDF must complete without throwing and produce a result.
        /// </summary>
        public void TestReadBigPDFFile()
        {
            string path   = TestDataSample.GetPdfPath("Word97-2007BinaryFileFormat(doc)Specification.pdf");
            var    parser = new PDFTextParser(new ParserContext(path));
            string result = parser.Parse();

            // Assert on the parser output rather than a tautology, so the test checks something
            // beyond "no exception was thrown".
            Assert.IsNotNull(result);
        }
Example #3
0
        /// <summary>
        /// Parses Sample1.PDF and checks that the extracted text begins with "LA MARCHE".
        /// </summary>
        public void TestParsePlainTextFromPDF()
        {
            var    context = new ParserContext(TestDataSample.GetPdfPath("Sample1.PDF"));
            string text    = new PDFTextParser(context).Parse();

            bool startsWithTitle = text.StartsWith("LA MARCHE");
            Assert.IsTrue(startsWithTitle);
        }
Example #4
0
        /// <summary>
        /// Extracts the text of a PDF file and returns it with CRLF line endings.
        /// </summary>
        /// <param name="file_path">Path of the PDF file passed to the parser context.</param>
        /// <returns>The parsed text with each line break written as "\r\n".</returns>
        private string GetTextFromPdf(string file_path)
        {
            var context = new ParserContext(file_path);
            var parser  = new PDFTextParser(context);
            var content = parser.Parse();

            // Collapse existing CRLF pairs to LF first: expanding "\n" directly would turn an
            // already-present "\r\n" into "\r\r\n".
            return content.Replace("\r\n", "\n").Replace("\n", "\r\n");
        }
Example #5
0
        /// <summary>
        /// Parses Sample5.PDF and checks the first two '\n'-separated lines of the extracted text.
        /// </summary>
        public void TestParsePlainTextFromSample5()
        {
            var    context = new ParserContext(TestDataSample.GetPdfPath("Sample5.PDF"));
            string text    = new PDFTextParser(context).Parse();

            string[] lines = text.Split('\n');

            Assert.AreEqual("License income by market (%)", lines[0]);
            Assert.AreEqual("Philadelphia, Atlanta, Dallas, San Diego, and New", lines[1]);
        }
Example #6
0
        /// <summary>
        /// Parses Sample1.PDF, checks that the text begins with "LA MARCHE" and then passes each
        /// expected fragment, in order, to <c>ContainText</c>.
        /// </summary>
        public void TestParsePlainTextFromSample1()
        {
            var    context = new ParserContext(TestDataSample.GetPdfPath("Sample1.PDF"));
            string text    = new PDFTextParser(context).Parse();

            Assert.IsTrue(text.StartsWith("LA MARCHE"));

            string[] expectedFragments =
            {
                "Toute discussion stratégique sur nos actions nécessite un rappel de ce que nous avons fait en",
                "l’an 2000 et depuis. Au niveau mondial, en l’an 2000, nous avons mené une campagne de",
                "Une structure pour nous amener à 2005",
                "Lors de la 4e rencontre qui aura lieu en Inde, nous avons deux objectifs majeurs"
            };

            foreach (string fragment in expectedFragments)
            {
                ContainText(text, fragment);
            }
        }
Example #7
0
        /// <summary>
        /// Manual run of the craft-object pipeline against fixed local files: loads the keywords,
        /// splits the test input into objects, dumps those objects to an intermediate text file,
        /// builds dictionaries from them and writes the result as XML.
        /// </summary>
        static void FunWithCraftObjects()
        {
            string dirPath       = @"E:\Documents\Tabletop RPGs\Numenera\APPs\";
            string testfilename  = dirPath + "TEST_Installations.txt";
            string keywordsFile  = dirPath + "KEYWORDS_Craft_Objects.txt";
            // Built once so the delete and the appends below cannot drift apart.
            string inBetweenFile = dirPath + "TEST_OUTPUT_InBetween.txt";

            // first - create Info
            var info = new PDFTextFileInfo
            {
                ItemsFileName = testfilename,
                TableKeyword  = ""
            };

            PDFTextParser.LoadKeywordsFromFile(info, keywordsFile);

            var lines = File.ReadAllLines(testfilename);
            // second - get string objects
            var parsedObjects = PDFTextParser.SplitItemsToObjects(info, lines);

            // Start from a clean intermediate file, then append every parsed object to it.
            File.Delete(inBetweenFile);
            foreach (var obj in parsedObjects)
            {
                File.AppendAllLines(inBetweenFile, obj);
            }

            // create Dictionary
            var dic = PDFTextParser.CreateDictionariesFromObjects(parsedObjects, info.KeywordsList);

            // and make XML from it
            var xmlInfo = new PDFTextXmlInfo()
            {
                XmlFileName = dirPath + "TEST_OUTPUT_Installations.xml",
                ObjectsName = "CraftObjects",
                ObjectName  = "CraftObject",
                Source      = "Destiny"
            };

            PDFTextXmlCreator.CreateXML(xmlInfo, dic, info.TableKeyword);
        }
Example #8
0
        /// <summary>
        /// Downloads attachments one id at a time, starting at <c>elastic.GetMaxId() + 1</c>,
        /// extracts text from .pdf, .docx, .rtf and .doc files and saves every attachment with a
        /// non-empty body through <c>elastic.SaveItem</c>.
        /// </summary>
        /// <remarks>
        /// The loop ends when a response carries no content type, or once the number of
        /// consecutive failed iterations reaches <c>_source.retries</c> (5 when not set).
        /// Each failure is written to the console and the next id is tried.
        /// </remarks>
        /// <returns>
        /// The attachment list. It is always empty in the current implementation because items
        /// are saved to Elastic and never added to the list.
        /// </returns>
        public List <Attatchment> FetchAttachementsData()
        {
            string  index   = _source.index;
            Elastic elastic = new Elastic(index);

            List <Attatchment> lstAttachments = new List <Attatchment>();

            int id = elastic.GetMaxId() + 1;

            Console.WriteLine("===> Max id:" + id);

            int maxRetries = _source.retries ?? 5;

            bool end   = false;
            int  retry = 0;

            // WebClient is IDisposable; the using block releases it on every exit path.
            using (WebClient wc = new WebClient())
            {
                do
                {
                    var tempFileName = Path.GetTempFileName();

                    try
                    {
                        string url = _source.url ?? "{0}";
                        url = string.Format(url, id++);
                        Console.WriteLine("=> Get " + url);

                        wc.DownloadFile(url, tempFileName);
                        var mimeType = wc.ResponseHeaders["content-type"];
                        Console.WriteLine("=> Mimetype " + mimeType);

                        // Fail with a descriptive message instead of a bare NullReferenceException;
                        // the catch below still logs it and counts it as a failed attempt.
                        var disposition = wc.ResponseHeaders["Content-Disposition"];
                        if (disposition == null)
                        {
                            throw new InvalidOperationException("Response for " + url + " has no Content-Disposition header.");
                        }

                        // Everything after "filename=" with the quotes stripped.
                        var fileName = disposition.Substring(disposition.IndexOf("filename=", StringComparison.Ordinal) + 9).Replace("\"", "");
                        Console.WriteLine("=> Filename " + fileName);

                        if (string.IsNullOrEmpty(mimeType))
                        {
                            // Last iteration; the current response is still processed below.
                            end = true;
                        }

                        var body = string.Empty;

                        // Pick the parser from the file extension, ignoring case.
                        // ".docx" is tested before ".doc" only for readability; neither suffix matches the other.
                        if (fileName.EndsWith(".pdf", StringComparison.OrdinalIgnoreCase))
                        {
                            var pdf = new PDFTextParser(new Toxy.ParserContext(tempFileName));
                            body = pdf.Parse();
                        }
                        else if (fileName.EndsWith(".docx", StringComparison.OrdinalIgnoreCase))
                        {
                            System.Text.Encoding.RegisterProvider(System.Text.CodePagesEncodingProvider.Instance);
                            var docx = new Word2007TextParser(new Toxy.ParserContext(tempFileName));
                            body = docx.Parse();
                        }
                        else if (fileName.EndsWith(".rtf", StringComparison.OrdinalIgnoreCase))
                        {
                            System.Text.Encoding.RegisterProvider(System.Text.CodePagesEncodingProvider.Instance);
                            var rtf = new RTFTextParser(new Toxy.ParserContext(tempFileName));
                            body = rtf.Parse();
                        }
                        else if (fileName.EndsWith(".doc", StringComparison.OrdinalIgnoreCase))
                        {
                            System.Text.Encoding.RegisterProvider(System.Text.CodePagesEncodingProvider.Instance);
                            var doc = new Word2003TextParser(new Toxy.ParserContext(tempFileName));
                            body = doc.Parse();
                        }

                        if (!string.IsNullOrEmpty(body))
                        {
                            // NOTE(review): id was already incremented by the Format call above, so the
                            // saved id is one higher than the id used in url. Confirm this is intended.
                            Attatchment attatch = new Attatchment(id.ToString(), fileName, url, mimeType, body, DateTime.Now);
                            elastic.SaveItem(attatch);
                        }

                        // A completed iteration resets the consecutive-failure counter.
                        retry = 0;
                    }
                    catch (Exception ex)
                    {
                        Console.WriteLine(ex);
                        retry++;
                    }
                    finally
                    {
                        // Best effort: a temp file that cannot be removed must not abort the run.
                        try { File.Delete(tempFileName); }
                        catch { }
                    }
                } while (!end && retry < maxRetries);
            }

            return(lstAttachments);
        }