/// <summary>
/// Scratch routine: opens the PDF named by <paramref name="input"/>, reads several
/// state values from a freshly created <c>PDFTextStripper</c>, then extracts the
/// full document text. Nothing is returned; the locals exist only so their values
/// can be inspected (for example under a debugger).
/// </summary>
/// <param name="input">File name handed to <c>PDDocument.load</c>.</param>
private static void KamilPdfTest(string input)
{
    PDDocument document = PDDocument.load(input);
    try
    {
        var textStripper = new PDFTextStripper();

        // Stripper state queried before any text extraction has been run.
        Matrix lineMatrix = textStripper.getTextLineMatrix();
        PDPage currentPage = textStripper.getCurrentPage();
        Matrix textMatrix = textStripper.getTextMatrix();
        int charCount = textStripper.getTotalCharCnt();
        string articleStart = textStripper.getArticleStart();
        string articleEnd = textStripper.getArticleEnd();

        // Puts the whole document text into a single string (this part works).
        string fullText = textStripper.getText(document);
        charCount = fullText.Length;
    }
    finally
    {
        // Always release the document, whatever happened above.
        if (document != null)
        {
            document.close();
        }
    }
}
/// <summary>
/// Extracts the text of "2.pdf" and pulls a fixed set of labelled fields out of it
/// with regular expressions. The raw text is shown in <c>textBox4</c>; the extracted
/// fields, joined with " / ", are shown in <c>textBox3</c>.
/// </summary>
/// <remarks>
/// A field whose pattern does not match contributes an empty string.
/// </remarks>
private void parsePDF()
{
    string text;
    PDDocument doc = null;
    try
    {
        doc = PDDocument.load("2.pdf");
        text = new PDFTextStripper().getText(doc);
    }
    finally
    {
        // Release the document even when loading or text extraction throws.
        if (doc != null)
        {
            doc.close();
        }
    }

    // Common tail of every "<label> <value>" pattern: group 1 is the value that
    // follows the label; the match has to end at a carriage return.
    const string valueTail = " (.*)(\n[^0-9])?.*\r";

    string orgName = FirstCapture(text, "Сокращенное наименование (.*)\"");
    string inn = FirstCapture(text, "ИНН" + valueTail);
    string kpp = FirstCapture(text, "КПП" + valueTail);
    string post = FirstCapture(text, "Должность" + valueTail);
    string lastName = FirstCapture(text, "Фамилия" + valueTail);
    string firstName = FirstCapture(text, "Имя" + valueTail);
    string patronymic = FirstCapture(text, "Отчество" + valueTail);
    string ogrn = FirstCapture(text, "ОГРН" + valueTail);
    string postIndex = FirstCapture(text, "Почтовый индекс" + valueTail);
    string region = FirstCapture(text, "Субъект Российской Федерации" + valueTail);

    // The parentheses in these two labels are literal text, so they are escaped
    // with a backslash; otherwise they would open a capture group and group 1
    // would be the label instead of the value.
    string street = FirstCapture(text, "Улица \\(проспект, переулок и т.д.\\)" + valueTail);
    string house = FirstCapture(text, "Дом \\(владение и т.п.\\)" + valueTail);

    textBox4.Text = text;
    textBox3.Text = string.Join(
        " / ",
        orgName, inn, kpp, post, lastName, firstName, patronymic,
        ogrn, postIndex, region, street, house);
}

/// <summary>
/// Returns capture group 1 of the first match of <paramref name="pattern"/> in
/// <paramref name="text"/>, or an empty string when the pattern does not match.
/// Matching uses <c>RegexOptions.Multiline | RegexOptions.IgnoreCase</c>.
/// </summary>
/// <param name="text">Text to search.</param>
/// <param name="pattern">Regular expression containing at least one capture group.</param>
/// <returns>The value of group 1, or "" if there is no match.</returns>
private static string FirstCapture(string text, string pattern)
{
    return Regex.Match(text, pattern, RegexOptions.Multiline | RegexOptions.IgnoreCase)
                .Groups[1]
                .Value;
}