示例#1
0
        /// <summary>
        /// Loads "2.pdf", extracts its text with <c>PDFTextStripper</c>, pulls a fixed
        /// set of labelled fields out of that text with regular expressions, and shows
        /// the raw text in <c>textBox4</c> and the " / "-joined field values in
        /// <c>textBox3</c>.
        /// </summary>
        /// <remarks>
        /// A field whose pattern does not match is shown as an empty string.
        /// Every pattern except the organisation name ends in <c>\r</c>, so those fields
        /// are only found when the extracted text uses CR LF line endings.
        /// </remarks>
        private void parsePDF()
        {
            string text;
            PDDocument doc = PDDocument.load("2.pdf");
            try
            {
                PDFTextStripper stripper = new PDFTextStripper();
                text = stripper.getText(doc);
            }
            finally
            {
                // Always release the document, even when text extraction throws.
                doc.close();
            }

            var orgName = ExtractFirstGroup(text, "Сокращенное наименование (.*)\"");
            var inn = ExtractFirstGroup(text, "ИНН (.*)(\n[^0-9])?.*\r");
            var kpp = ExtractFirstGroup(text, "КПП (.*)(\n[^0-9])?.*\r");
            var post = ExtractFirstGroup(text, "Должность (.*)(\n[^0-9])?.*\r");
            var lastName = ExtractFirstGroup(text, "Фамилия (.*)(\n[^0-9])?.*\r");
            var firstName = ExtractFirstGroup(text, "Имя (.*)(\n[^0-9])?.*\r");
            var surName = ExtractFirstGroup(text, "Отчество (.*)(\n[^0-9])?.*\r");
            var ogrn = ExtractFirstGroup(text, "ОГРН (.*)(\n[^0-9])?.*\r");
            var postIndex = ExtractFirstGroup(text, "Почтовый индекс (.*)(\n[^0-9])?.*\r");
            var city = ExtractFirstGroup(text, "Субъект Российской Федерации (.*)(\n[^0-9])?.*\r");

            // The label parentheses must be escaped with a backslash. The previous "/(" and "/)"
            // were a literal slash followed by a real capturing group, which made group 1 the
            // label's parenthetical text instead of the value that follows it.
            // NOTE(review): assumes the PDF label contains literal "(" and ")" with no slashes,
            // and the same number of spaces before the value as before - confirm on a sample file.
            var street = ExtractFirstGroup(text, "Улица \\(проспект, переулок и т.д.\\)  (.*)(\n[^0-9])?.*\r");
            var house = ExtractFirstGroup(text, "Дом \\(владение и т.п.\\)   (.*)(\n[^0-9])?.*\r");

            textBox4.Text = text;
            textBox3.Text = string.Join(" / ", new string[]
            {
                orgName, inn, kpp, post, lastName, firstName,
                surName, ogrn, postIndex, city, street, house
            });
        }

        /// <summary>
        /// Returns the text captured by group 1 of the first match of <paramref name="pattern"/>
        /// in <paramref name="text"/>, or an empty string when the pattern does not match.
        /// </summary>
        /// <param name="text">Text to search.</param>
        /// <param name="pattern">Regular expression containing at least one capturing group.</param>
        /// <returns>The value of capturing group 1, or an empty string if there is no match.</returns>
        private static string ExtractFirstGroup(string text, string pattern)
        {
            Regex regex = new Regex(pattern, RegexOptions.Multiline | RegexOptions.IgnoreCase);
            return regex.Match(text).Groups[1].Value;
        }