/// <summary>
        /// Form load handler: extracts the text of page 1 of "Document.pdf" (looked up in the
        /// current user's Desktop folder) with a <c>TextWithFontExtractionStategy</c>, writes the
        /// result to the console and then closes the form.
        /// </summary>
        /// <param name="sender">Event source (unused).</param>
        /// <param name="e">Event data (unused).</param>
        private void Form1_Load(object sender, EventArgs e)
        {
            string pdfPath = System.IO.Path.Combine(Environment.GetFolderPath(Environment.SpecialFolder.Desktop), "Document.pdf");

            PdfReader reader = new PdfReader(pdfPath);
            try
            {
                TextWithFontExtractionStategy S = new TextWithFontExtractionStategy();
                string F = iTextSharp.text.pdf.parser.PdfTextExtractor.GetTextFromPage(reader, 1, S);

                Console.WriteLine(F);
            }
            finally
            {
                //Always release the reader, even when extraction throws
                reader.Close();
            }

            this.Close();
        }
        /// <summary>
        /// Form load handler: extracts the text of page 1 of "nmat4-42.pdf" (looked up in the
        /// current user's Desktop folder), picks out title and author lines based on the font
        /// name and font size mentioned in each extracted line, prints them followed by the raw
        /// extracted text, and then closes the form.
        /// </summary>
        /// <param name="sender">Event source (unused).</param>
        /// <param name="e">Event data (unused).</param>
        private void Form1_Load(object sender, EventArgs e)
        {
            //Pattern matching opening and closing span tags in the extracted text
            const string spanTagPattern = "</?span.*?>";

            string pdfPath = System.IO.Path.Combine(Environment.GetFolderPath(Environment.SpecialFolder.Desktop), "nmat4-42.pdf");

            string F;
            PdfReader reader = new PdfReader(pdfPath);
            try
            {
                TextWithFontExtractionStategy S = new TextWithFontExtractionStategy();
                F = iTextSharp.text.pdf.parser.PdfTextExtractor.GetTextFromPage(reader, 1, S);
            }
            finally
            {
                //Always release the reader, even when extraction throws
                reader.Close();
            }

            //Buffers to hold various parts from the PDF
            List<string> titles  = new List<string>();
            List<string> authors = new List<string>();

            //Array of lines of text
            string[] lines = F.Split(new string[] { Environment.NewLine }, StringSplitOptions.None);

            //Loop through each line in the array
            foreach (string line in lines)
            {
                //See if the line looks like a "title"
                if (line.Contains("HelveticaNeue-LightExt") && line.Contains("font-size:17.28003"))
                {
                    //Remove the HTML tags
                    titles.Add(System.Text.RegularExpressions.Regex.Replace(line, spanTagPattern, "").Trim());
                }
                //See if the line looks like an "author"
                else if (line.Contains("HelveticaNeue-Condensed") && line.Contains("font-size:9.995972"))
                {
                    //Remove the HTML tags and trim separators (space, comma, asterisk) from both ends
                    string candidate = System.Text.RegularExpressions.Regex.Replace(line, spanTagPattern, "").Trim(new char[] { ' ', ',', '*' });

                    //Skip empty fragments and the literal "AND" joining word; more exceptions may be needed
                    if (!string.IsNullOrWhiteSpace(candidate) && candidate != "AND")
                    {
                        authors.Add(candidate);
                    }
                }
            }

            //Write out the title to the console (title fragments are joined with single spaces)
            Console.WriteLine("Title  : {0}", string.Join(" ", titles));

            //Write out each author
            foreach (string author in authors)
            {
                Console.WriteLine("Author : {0}", author);
            }

            //Dump the raw extracted text as well
            Console.WriteLine(F);

            this.Close();
        }
Example #3
0
        /// <summary>
        /// Searches a PDF for red words (hex FFFF0000). Note that some phrases might be cut into
        /// chunks, so the number of red words might be higher than the actual number of red phrases.
        /// </summary>
        /// <remarks>
        /// Resets <c>_redWords</c> to zero, runs text extraction over pages 1..<c>PdfPages</c> with a
        /// single <c>TextWithFontExtractionStategy</c> instance and returns <c>_redWords</c> afterwards.
        /// NOTE(review): the counter is presumably incremented by the extraction strategy while it
        /// renders text - that code is not visible here, confirm.
        /// </remarks>
        /// <param name="path"> System path to PDF file. </param>
        /// <returns>Count of red words. </returns>
        public static int SearchFile(string path)
        {
            SetPDFPagesCount(path);
            _redWords = 0;

            PdfReader reader = new PdfReader(path);
            try
            {
                TextWithFontExtractionStategy S = new TextWithFontExtractionStategy();

                for (int i = 1; i <= PdfPages; i++)
                {
                    //The extracted text itself is not needed; the call is made for its side effects
                    iTextSharp.text.pdf.parser.PdfTextExtractor.GetTextFromPage(reader, i, S);
                }
            }
            finally
            {
                //Always release the reader, even when extraction throws
                reader.Close();
            }

            return _redWords;
        }
Example #4
0
        /// <summary>
        /// Looks for number of phrases with fontcolor RED and for text match. Returns string composed of phrases count and bool if text matched.
        /// </summary>
        /// <remarks>
        /// NOTE(review): <c>_redWords</c> is presumably incremented by the extraction strategy while
        /// it renders text - that code is not visible here, confirm.
        /// </remarks>
        /// <param name="path">System path to the PDF file.</param>
        /// <param name="searchText">Text passed on to the two-argument <c>SearchFile</c> overload for matching.</param>
        /// <param name="getRedPhrasesCount">When true, the result is prefixed with "Number of red phrases: N. ".</param>
        /// <returns>
        /// The optional red-phrase count prefix followed by either "Match text found." or
        /// "Match text NOT found.".
        /// </returns>
        public static string SearchFile(string path, string searchText, bool getRedPhrasesCount)
        {
            SetPDFPagesCount(path); //set internally how many pages should be looped

            _redWords = 0;
            string rstring = "";

            if (getRedPhrasesCount)
            {
                PdfReader reader = new PdfReader(path);
                try
                {
                    TextWithFontExtractionStategy S = new TextWithFontExtractionStategy();
                    for (int i = 1; i <= PdfPages; i++)
                    {
                        //The extracted text itself is not needed; the call is made for its side effects
                        iTextSharp.text.pdf.parser.PdfTextExtractor.GetTextFromPage(reader, i, S);
                    }
                }
                finally
                {
                    //Always release the reader, even when extraction throws
                    reader.Close();
                }

                rstring += "Number of red phrases: " + _redWords.ToString() + ". ";
            }

            bool isMatched = SearchFile(path, searchText);

            rstring += isMatched ? "Match text found." : "Match text NOT found.";

            return rstring;
        }
Example #5
0
        /// <summary>
        /// Gets and checks footnote order.
        /// </summary>
        /// <remarks>
        /// Resets the shared footnote-tracking state, runs text extraction over every page, then
        /// walks the collected rows sorted by page (ascending) and position (descending). Each
        /// footnote number must equal the previous one or the previous one plus one; otherwise
        /// <c>_badFootnoteFound</c> is set to true. This method never sets the flag back to false.
        /// NOTE(review): the rows in <c>dt</c> are presumably added by the extraction strategy
        /// while it renders text - that code is not visible here, confirm.
        /// </remarks>
        /// <param name="path">System path to the PDF file to check.</param>
        private static void GetFootnote(string path)
        {
            //Reset the shared state used during extraction
            currentFontSize      = 0;
            currentFootnoteValue = 0;
            _prevInumber         = 0;

            _prevDoubleText = "";
            _prevTopRight   = (float)0;

            replaceCounter   = 0;
            _doFootnoteCheck = true;

            dt = new DataTable();
            dt.Columns.Add("Number", typeof(int));
            dt.Columns.Add("Position", typeof(float));
            dt.Columns.Add("Page", typeof(float));

            SetPDFPagesCount(path);

            PdfReader reader2 = new PdfReader(path);
            try
            {
                TextWithFontExtractionStategy S = new TextWithFontExtractionStategy();

                //clear for double badfootnote check issue from 15-10-2014
                dt.Rows.Clear();
                dictionary.Clear();

                for (int i = 1; i <= PdfPages; i++)
                {
                    //Publish the page currently being processed
                    pageCounter = i;
                    //The extracted text itself is not needed; the call is made for its side effects
                    iTextSharp.text.pdf.parser.PdfTextExtractor.GetTextFromPage(reader2, i, S);
                }
            }
            finally
            {
                //Always release the reader, even when extraction throws
                reader2.Close();
            }

            int      prev = 0;
            DataView dv   = dt.DefaultView;

            dv.Sort = " Page asc, position desc";
            DataTable sortedDT = dv.ToTable();

            foreach (DataRow row in sortedDT.Rows)
            {
                //Parse the footnote number once per row
                int number = Int32.Parse(row[0].ToString());

                //A footnote may repeat the previous number or follow it directly; anything else is out of order
                if (number != prev && number != prev + 1)
                {
                    _badFootnoteFound = true;
                }
                prev = number;
            }

            //Leave the shared state clean for the next run
            dictionary.Clear();
            dt.Clear();

            _prevInumber   = 0;
            replaceCounter = 0;
        }