/// <summary>
/// Form load handler: extracts the text of page 1 of "Document.pdf" located on the
/// current user's desktop, writes it to the console and then closes the form.
/// </summary>
/// <param name="sender">Event source (not used).</param>
/// <param name="e">Event data (not used).</param>
private void Form1_Load(object sender, EventArgs e)
{
    string pdfPath = System.IO.Path.Combine(Environment.GetFolderPath(Environment.SpecialFolder.Desktop), "Document.pdf");
    PdfReader reader = new PdfReader(pdfPath);
    string F;
    try
    {
        TextWithFontExtractionStategy S = new TextWithFontExtractionStategy();
        F = iTextSharp.text.pdf.parser.PdfTextExtractor.GetTextFromPage(reader, 1, S);
    }
    finally
    {
        // Release the underlying file even when extraction throws.
        reader.Close();
    }

    Console.WriteLine(F);
    this.Close();
}
/// <summary>
/// Form load handler: extracts the text of page 1 of "nmat4-42.pdf" located on the
/// current user's desktop, picks out the lines that look like the title and the
/// authors, prints them followed by the raw extracted text, then closes the form.
/// </summary>
/// <remarks>
/// Lines are classified by the font name and font size that appear in the extracted
/// text; the span markup is removed before the text is stored.
/// NOTE(review): the markup is presumably produced by TextWithFontExtractionStategy - confirm.
/// </remarks>
/// <param name="sender">Event source (not used).</param>
/// <param name="e">Event data (not used).</param>
private void Form1_Load(object sender, EventArgs e)
{
    string pdfPath = System.IO.Path.Combine(Environment.GetFolderPath(Environment.SpecialFolder.Desktop), "nmat4-42.pdf");
    PdfReader reader = new PdfReader(pdfPath);
    string F;
    try
    {
        TextWithFontExtractionStategy S = new TextWithFontExtractionStategy();
        F = iTextSharp.text.pdf.parser.PdfTextExtractor.GetTextFromPage(reader, 1, S);
    }
    finally
    {
        // The reader is only needed for extraction; release the file right away,
        // even when extraction throws.
        reader.Close();
    }

    // Buffers to hold various parts from the PDF
    List<string> titles = new List<string>();
    List<string> authors = new List<string>();

    // Array of lines of text
    string[] lines = F.Split(new string[] { Environment.NewLine }, StringSplitOptions.None);

    foreach (string line in lines)
    {
        if (line.Contains("HelveticaNeue-LightExt") && line.Contains("font-size:17.28003"))
        {
            // Looks like a "title" line: remove the span tags
            titles.Add(System.Text.RegularExpressions.Regex.Replace(line, "</?span.*?>", "").Trim());
        }
        else if (line.Contains("HelveticaNeue-Condensed") && line.Contains("font-size:9.995972"))
        {
            // Looks like an "author" line: remove the span tags and trim separator characters
            string t = System.Text.RegularExpressions.Regex.Replace(line, "</?span.*?>", "").Trim(new char[] { ' ', ',', '*' });

            // Make sure we have a valid name; more exceptions are probably needed here
            if (!string.IsNullOrWhiteSpace(t) && t != "AND")
            {
                authors.Add(t);
            }
        }
    }

    // Write out the title to the console
    Console.WriteLine("Title : {0}", string.Join(" ", titles.ToArray()));

    // Write out each author
    foreach (string author in authors)
    {
        Console.WriteLine("Author : {0}", author);
    }

    Console.WriteLine(F);
    this.Close();
}
/// <summary>
/// Searching for red words hex FFFF0000. Note that some phrases might be cut into chunks,
/// so number of red words might be higher than actual red phrases.
/// </summary>
/// <remarks>
/// Resets <c>_redWords</c> to zero, runs text extraction over pages 1..<c>PdfPages</c> and
/// returns <c>_redWords</c>.
/// NOTE(review): the counter is presumably incremented by TextWithFontExtractionStategy
/// while pages are being extracted - confirm.
/// </remarks>
/// <param name="path"> System path to PDF file. </param>
/// <returns>Count of red words. </returns>
public static int SearchFile(string path)
{
    SetPDFPagesCount(path);
    _redWords = 0;

    PdfReader reader = new PdfReader(path);
    try
    {
        TextWithFontExtractionStategy S = new TextWithFontExtractionStategy();
        for (int page = 1; page <= PdfPages; page++)
        {
            // The returned text is not needed; the call is made for the strategy's processing.
            iTextSharp.text.pdf.parser.PdfTextExtractor.GetTextFromPage(reader, page, S);
        }
    }
    finally
    {
        // Release the underlying file even when extraction throws.
        reader.Close();
    }

    return _redWords;
}
/// <summary>
/// Looks for number of phrases with fontcolor RED and for text match.
/// Returns string composed of phrases count and bool if text matched.
/// </summary>
/// <param name="path">System path to PDF file.</param>
/// <param name="searchText">Text passed to the two-argument <c>SearchFile</c> overload.</param>
/// <param name="getRedPhrasesCount">
/// When true, the result starts with "Number of red phrases: N. "; when false, the red
/// phrase scan is skipped and only the match message is returned.
/// </param>
/// <returns>The optional red phrase count followed by the match / no-match message.</returns>
public static string SearchFile(string path, string searchText, bool getRedPhrasesCount)
{
    SetPDFPagesCount(path); // set internally how many pages should be looped
    _redWords = 0;
    string rstring = "";

    if (getRedPhrasesCount)
    {
        rstring += "Number of red phrases: ";

        PdfReader reader = new PdfReader(path);
        try
        {
            TextWithFontExtractionStategy S = new TextWithFontExtractionStategy();
            for (int page = 1; page <= PdfPages; page++)
            {
                // The returned text is not needed; the call is made for the strategy's processing.
                iTextSharp.text.pdf.parser.PdfTextExtractor.GetTextFromPage(reader, page, S);
            }
        }
        finally
        {
            // Release the underlying file even when extraction throws.
            reader.Close();
        }

        rstring += _redWords.ToString();
        rstring += ". ";
    }

    bool isMatched = SearchFile(path, searchText);
    rstring += isMatched ? "Match text found." : "Match text NOT found.";

    return rstring;
}
/// <summary>
/// Gets and checks footnote order.
/// </summary>
/// <remarks>
/// Resets the shared footnote tracking state, runs text extraction over pages
/// 1..<c>PdfPages</c>, then walks the rows of <c>dt</c> sorted by page (ascending) and
/// position (descending). <c>_badFootnoteFound</c> is set to true when a row's number is
/// neither equal to the previous number nor exactly one greater. The flag is never reset
/// to false here.
/// NOTE(review): <c>dt</c> and <c>dictionary</c> are presumably filled by
/// TextWithFontExtractionStategy during extraction - confirm.
/// </remarks>
/// <param name="path">System path to PDF file.</param>
private static void GetFootnote(string path)
{
    currentFontSize = 0;
    currentFootnoteValue = 0;
    _prevInumber = 0;
    _prevDoubleText = "";
    _prevTopRight = (float)0;
    replaceCounter = 0;
    _doFootnoteCheck = true;

    dt = new DataTable();
    dt.Columns.Add("Number", typeof(int));
    dt.Columns.Add("Position", typeof(float));
    dt.Columns.Add("Page", typeof(float));

    SetPDFPagesCount(path);
    PdfReader reader2 = new PdfReader(path);
    try
    {
        TextWithFontExtractionStategy S = new TextWithFontExtractionStategy();

        // clear for double badfootnote check issue from 15-10-2014
        dt.Rows.Clear();
        dictionary.Clear();

        for (int i = 1; i < PdfPages + 1; i++)
        {
            pageCounter = i;
            // The returned text is not needed; the call is made for the strategy's processing.
            iTextSharp.text.pdf.parser.PdfTextExtractor.GetTextFromPage(reader2, i, S);
        }
    }
    finally
    {
        // Release the underlying file even when extraction throws.
        reader2.Close();
    }

    int prev = 0;
    DataView dv = dt.DefaultView;
    dv.Sort = " Page asc, position desc";
    DataTable sortedDT = dv.ToTable();

    foreach (DataRow row in sortedDT.Rows)
    {
        // Parse once per row instead of re-parsing for every comparison.
        int number = Int32.Parse(row[0].ToString());

        // A footnote number may repeat or advance by exactly one; anything else is out of order.
        if (number != prev + 1 && number != prev)
        {
            _badFootnoteFound = true;
        }

        prev = number;
    }

    dictionary.Clear();
    dt.Clear();
    _prevInumber = 0;
    replaceCounter = 0;
}