Example #1
0
        /// <summary>
        /// Scratch routine: loads a PDF, reads several properties from a freshly
        /// created <c>PDFTextStripper</c>, then extracts the document text.
        /// Nothing is returned; all results stay in locals.
        /// </summary>
        /// <param name="input">Value handed to <c>PDDocument.load</c> (presumably a file path).</param>
        private static void KamilPdfTest(string input)
        {
            PDDocument document = null;

            try
            {
                document = PDDocument.load(input);
                PDFTextStripper textStripper = new PDFTextStripper();

                // Stripper properties are queried before getText has been called.
                Matrix lineMatrix = textStripper.getTextLineMatrix();
                PDPage currentPage = textStripper.getCurrentPage();
                Matrix textMatrix = textStripper.getTextMatrix();
                int characterCount = textStripper.getTotalCharCnt();

                string articleStart = textStripper.getArticleStart();
                string articleEnd = textStripper.getArticleEnd();

                // Puts the whole document text into one string (original note: this works).
                string fullText = textStripper.getText(document);
                characterCount = fullText.Length;
            }
            finally
            {
                // Close the document only if loading got far enough to produce one.
                if (document != null)
                {
                    document.close();
                }
            }
        }
Example #2
0
        /// <summary>
        /// Extracts the text of "2.pdf", pulls a fixed set of labelled fields out of it
        /// with regular expressions, and shows the results: the raw text goes to
        /// <c>textBox4</c>, the " / "-separated field values go to <c>textBox3</c>.
        /// A field whose pattern does not match contributes an empty string.
        /// </summary>
        private void parsePDF()
        {
            string text;
            PDDocument doc = null;

            try
            {
                doc = PDDocument.load("2.pdf");
                PDFTextStripper stripper = new PDFTextStripper();
                text = stripper.getText(doc);
            }
            finally
            {
                // The document was previously never closed; release it even if extraction throws.
                if (doc != null)
                {
                    doc.close();
                }
            }

            var orgName = ExtractFirstGroup(text, "Сокращенное наименование (.*)\"");
            var inn = ExtractFirstGroup(text, "ИНН (.*)(\n[^0-9])?.*\r");
            var kpp = ExtractFirstGroup(text, "КПП (.*)(\n[^0-9])?.*\r");
            var post = ExtractFirstGroup(text, "Должность (.*)(\n[^0-9])?.*\r");
            var lastName = ExtractFirstGroup(text, "Фамилия (.*)(\n[^0-9])?.*\r");
            var firstName = ExtractFirstGroup(text, "Имя (.*)(\n[^0-9])?.*\r");
            var surName = ExtractFirstGroup(text, "Отчество (.*)(\n[^0-9])?.*\r");
            var ogrn = ExtractFirstGroup(text, "ОГРН (.*)(\n[^0-9])?.*\r");
            var postIndex = ExtractFirstGroup(text, "Почтовый индекс (.*)(\n[^0-9])?.*\r");
            var city = ExtractFirstGroup(text, "Субъект Российской Федерации (.*)(\n[^0-9])?.*\r");

            // The label parentheses and dots are escaped so they match literally. The
            // earlier "/(" ... "/)" form turned the label itself into capture group 1,
            // so the value after the label could never be returned.
            // NOTE(review): assumes the form prints the labels with plain parentheses,
            // e.g. "Улица (проспект, переулок и т.д.)" — confirm against a sample PDF.
            var street = ExtractFirstGroup(text, "Улица \\(проспект, переулок и т\\.д\\.\\)  (.*)(\n[^0-9])?.*\r");
            var house = ExtractFirstGroup(text, "Дом \\(владение и т\\.п\\.\\)   (.*)(\n[^0-9])?.*\r");

            textBox4.Text = text;
            textBox3.Text = string.Join(
                " / ",
                new string[]
                {
                    orgName, inn, kpp, post, lastName, firstName,
                    surName, ogrn, postIndex, city, street, house
                });
        }

        /// <summary>
        /// Returns the text captured by group 1 of the first match of
        /// <paramref name="pattern"/> in <paramref name="text"/>, or an empty string
        /// when there is no match.
        /// </summary>
        /// <param name="text">Text to search.</param>
        /// <param name="pattern">Regular expression containing at least one capture group.</param>
        /// <returns>The value of capture group 1, or "" if the pattern does not match.</returns>
        private static string ExtractFirstGroup(string text, string pattern)
        {
            Regex regex = new Regex(pattern, RegexOptions.Multiline | RegexOptions.IgnoreCase);
            return regex.Match(text).Groups[1].Value;
        }