Exemple #1
0
        /// <summary>
        /// Extracts the text of every page of a PDF file and concatenates it into one string.
        /// </summary>
        /// <param name="fileName">The full path to the PDF file.</param>
        /// <param name="success">
        /// Set to <c>true</c> when all pages were processed without an exception; otherwise <c>false</c>.
        /// </param>
        /// <returns>The extracted text, or <see cref="String.Empty"/> when an exception occurred.</returns>
        internal static String ExtractText(String fileName, out bool success)
        {
            PdfReader reader = null;
            success = false;

            try
            {
                reader = new PdfReader(fileName);
                PdfReaderContentParser parser = new PdfReaderContentParser(reader);

                // Accumulate in a builder: string concatenation in the loop would re-copy
                // all previously extracted text once per page.
                System.Text.StringBuilder text = new System.Text.StringBuilder();

                for (int page = 1; page <= reader.NumberOfPages; page++)
                {
                    SimpleTextExtractionStrategy strategy = parser.ProcessContent(page, new SimpleTextExtractionStrategy());
                    text.Append(strategy.GetResultantText());
                }

                success = true;
                return text.ToString();
            }
            catch (Exception)
            {
                // Best effort by design: failure is reported through 'success' instead of being thrown.
                return String.Empty;
            }
            finally
            {
                // Release the reader on both the success and the failure path.
                if (reader != null)
                {
                    reader.Close();
                }
            }
        }
        /// <summary>
        /// Extracts the text of each page of a PDF document read from a stream.
        /// </summary>
        /// <param name="pdfStream">Stream that contains the PDF document.</param>
        /// <returns>One string per page, in page order.</returns>
        public List <string> GetPdfPagesContent(Stream pdfStream)
        {
            List <string> allPDFPagesContent = new List <string>();
            PdfReader     reader             = new PdfReader(pdfStream);

            try
            {
                PdfReaderContentParser parser = new PdfReaderContentParser(reader);

                // Pages are processed from 1 through NumberOfPages.
                for (int i = 1; i <= reader.NumberOfPages; i++)
                {
                    // NOTE(review): 'strategy' is not declared inside this method; presumably it is a
                    // member of the enclosing class - confirm before turning it into a local.
                    strategy = parser.ProcessContent(i, new SimpleTextExtractionStrategy());
                    allPDFPagesContent.Add(strategy.GetResultantText());
                }
            }
            finally
            {
                // Release the reader even when a page fails to parse.
                reader.Close();
            }

            return(allPDFPagesContent);
        }
        /// <summary>
        /// Extracts the text content of a PDF, one entry per page.
        /// </summary>
        /// <param name="sourcePdf">The raw bytes of the PDF document.</param>
        /// <returns>A list holding the extracted text of each page, in page order.</returns>
        public static List <String> ExtractTextPaged(byte[] sourcePdf)
        {
            List <String> _textExtractedPaged = new List <string>();

            using (PdfDocument _sourcePdfDocument = new PdfDocument(new PdfReader(new MemoryStream(sourcePdf))))
            {
                int pageCount = _sourcePdfDocument.GetNumberOfPages();

                // The loop starts at page 1, so the bound must be inclusive;
                // a strict '<' would silently drop the last page.
                for (int i = 1; i <= pageCount; i++)
                {
                    // A fresh listener and strategy per page keep each page's text separate.
                    FilteredEventListener        listener           = new FilteredEventListener();
                    SimpleTextExtractionStrategy extractionStrategy = listener.AttachEventListener(new SimpleTextExtractionStrategy());
                    new PdfCanvasProcessor(listener).ProcessPageContent(_sourcePdfDocument.GetPage(i));
                    _textExtractedPaged.Add(extractionStrategy.GetResultantText());
                }
            }

            return(_textExtractedPaged);
        }
Exemple #4
0
        // Extracts the text of every page of the PDF at SRC and writes it, UTF-8 encoded, to 'dest'.
        public static void getTextFromPdf(String dest)
        {
            SimpleTextExtractionStrategy strategy = new SimpleTextExtractionStrategy();

            // Dispose the document even when a page fails to parse.
            using (PdfDocument pdfDoc = new PdfDocument(new PdfReader(SRC)))
            {
                PdfCanvasProcessor parser = new PdfCanvasProcessor(strategy);

                // One strategy instance is shared by all pages.
                // NOTE(review): this relies on the strategy accumulating text across pages, as the
                // previous per-page rewrite of the file from offset 0 also did - confirm.
                for (var i = 1; i <= pdfDoc.GetNumberOfPages(); i++)
                {
                    parser.ProcessPageContent(pdfDoc.GetPage(i));
                }
            }

            byte[] array = Encoding.UTF8.GetBytes(strategy.GetResultantText());

            // Write once, after all pages. FileMode.Create truncates an existing file, so no
            // trailing bytes of a longer, pre-existing file are left behind.
            using (FileStream stream = new FileStream(dest, FileMode.Create))
            {
                stream.Write(array, 0, array.Length);
            }
        }
        /// <summary>
        /// Reads pages 208-253 of the PDF given as the first argument, writes the extracted text to
        /// "EntireText.txt" and dumps every entry of <c>TextWithWidth</c> to "AllLines.txt".
        /// </summary>
        /// <param name="args">Command-line arguments; the first one is the path of the PDF file.</param>
        public static void Main(string[] args)
        {
            Contract.Assert(args.Any(), "No arguments passed.");

            var path          = args.First();
            var extractedText = string.Empty;

            // monsters are on page 208 - 253

            using (var pdfReader = new PdfReader(path))
            {
                var          stringBuilder = new StringBuilder();
                var          pdfDocument   = new PdfDocument(pdfReader);
                var          listener      = new FilteredEventListener();
                CustomFilter customFilter  = null;
                SimpleTextExtractionStrategy extractionStrategy = null;
                PdfCanvasProcessor           pdfProcessor       = null;

                for (int i = 208; i <= 253; i++)
                {
                    Console.WriteLine(@"Reading page {0}.", i);
                    var page = pdfDocument.GetPage(i);

                    // The filter, strategy and processor are created once, from the art box of
                    // the first processed page, and then reused for every following page.
                    if (pdfProcessor == null)
                    {
                        customFilter       = new CustomFilter(page.GetArtBox());
                        extractionStrategy = listener.AttachEventListener(new SimpleTextExtractionStrategy(), customFilter);
                        pdfProcessor       = new PdfCanvasProcessor(listener);
                    }

                    pdfProcessor.ProcessPageContent(page);
                    stringBuilder.AppendLine(extractionStrategy.GetResultantText());

                    // Debugging aid: break when the two most recent entries are equal.
                    // Both nodes are checked for null first, because with fewer than two collected
                    // entries dereferencing Last / Last.Previous would throw.
                    // NOTE(review): assumes TextWithWidth is a linked list whose nodes are reference types - confirm.
                    var lastNode = Program.TextWithWidth.Last;
                    if (lastNode != null && lastNode.Previous != null && lastNode.Value.Equals(lastNode.Previous.Value))
                    {
                        Debugger.Break();
                    }
                }

                extractedText = stringBuilder.ToString();
            }

            //m_parser = new TextToMonsterDataParser(extractedText);
            //var monsters = m_parser.GetMonsterData();


            Console.WriteLine("Read all text.");
            Console.ReadLine();

            // Remove the output files of a previous run before writing new ones.
            if (System.IO.File.Exists(@"AllLines.txt"))
            {
                System.IO.File.Delete(@"AllLines.txt");
            }

            if (System.IO.File.Exists(@"EntireText.txt"))
            {
                System.IO.File.Delete(@"EntireText.txt");
            }

            Console.WriteLine("Printing text.");
            System.IO.File.WriteAllText(@"EntireText.txt", extractedText);

            // Reports how many collected entries have exactly this text.
            Console.WriteLine("Found {0} duplicates.", TextWithWidth.Count(n => n.String.Equals("die’")));

            Console.WriteLine("Printing lines.");


            // One line per collected entry, in list order: text followed by its width in parentheses.
            using (System.IO.StreamWriter file = new System.IO.StreamWriter(@"AllLines.txt", false))
            {
                var node = TextWithWidth.First;
                while (node != null)
                {
                    file.WriteLine("{0}({1})", node.Value.String, node.Value.Width);
                    node = node.Next;
                }
            }

            Console.ReadLine();
        }
Exemple #6
0
        /// <summary>
        /// Builds a page index for the terms listed in "pojmovi.txt": scans the text of every page
        /// of "knjiga.pdf" for each term and writes the matching page numbers to "registar.txt".
        /// </summary>
        static void Main()
        {
            Console.OutputEncoding = Encoding.UTF8;
            // Final letters (Latin and Cyrillic) that, preceded by an "a", mark a word for which
            // a shortened variant without that ending is searched as well.
            char[]       nepLat     = { 'c', 'k', 'r', 'v' };
            char[]       nepCir     = { 'ц', 'к', 'р', 'в' };
            string[]     pojmoviOrg = File.ReadAllLines("pojmovi.txt");
            string[]     pojmovi    = (string[])pojmoviOrg.Clone();
            List <int>[] stranice   = new List <int> [pojmovi.Length];
            for (int i = 0; i < stranice.Length; i++)
            {
                stranice[i] = new List <int>();
            }

            // 'using' releases the document even when page processing throws.
            using (PdfDocument pdfDocument = new PdfDocument(new PdfReader("knjiga.pdf")))
            {
                var pageNumbers = pdfDocument.GetNumberOfPages();

                for (int pageIndex = 1; pageIndex <= pageNumbers; pageIndex++)
                {
                    // Show progress as a percentage in the console title.
                    Console.Title = (pageIndex / (float)pageNumbers).ToString("P");
                    SimpleTextExtractionStrategy strategy = new SimpleTextExtractionStrategy();
                    new PdfCanvasProcessor(strategy).ProcessPageContent(pdfDocument.GetPage(pageIndex));
                    string page = strategy.GetResultantText().RemoveWrap();

                    for (int p = 0; p < pojmovi.Length; p++)
                    {
                        // Replace dashes with spaces.
                        pojmovi[p] = pojmovi[p].Replace('-', ' ');
                        // Keep only letters, spaces, periods and '(' ('(' separates sub-terms below).
                        pojmovi[p] = new string(pojmovi[p].Where(c => char.IsLetter(c) || c == ' ' || c == '.' || c == '(').ToArray()).Trim();
                        string[] podpojmovi = pojmovi[p].Split('(');
                        foreach (string podpojam in podpojmovi)
                        {
                            List <string> pojam    = podpojam.Trim().Split(' ').ToList(); // the term split into words
                            List <string> pojamNep = new List <string>();                 // word variants with the trailing "a" + consonant cut off
                            List <int>    brNep    = new List <int>();                    // indices of the words those variants were derived from
                            for (int i = 0; i < pojam.Count; i++)
                            {
                                if (pojam[i].Length > 3 && pojam[i].Length < 9 && (nepLat.Contains(pojam[i].Last()) || nepCir.Contains(pojam[i].Last())) && (pojam[i][pojam[i].Length - 2] == 'a' || pojam[i][pojam[i].Length - 2] == 'а'))
                                {
                                    pojamNep.Add(pojam[i].Substring(0, pojam[i].Length - 2));
                                    brNep.Add(i);
                                }
                                // Drop the last letter (the last two for words longer than 8) when the term has
                                // more than one word or the word has more than 3 letters, unless it ends in "ић".
                                // The Length > 0 guard skips empty words produced by consecutive spaces;
                                // without it Remove(-1) would throw.
                                if (pojam[i].Length > 0 && (pojam.Count > 1 || pojam[i].Length > 3) && pojam[i].Substring(Math.Max(0, pojam[i].Length - 2)) != "ић")
                                {
                                    pojam[i] = pojam[i].Remove(pojam[i].Length - (pojam[i].Length > 8 ? 2 : 1));
                                }
                                // Words shorter than 3 letters are not searched for.
                                if (pojam[i].Length < 3)
                                {
                                    pojam.RemoveAt(i);
                                    i--;
                                }
                            }
                            // Minimum number of words that must be found for the term to count as found.
                            int minIma   = Math.Max(pojam.Count == 1 ? 1 : 2, (int)Math.Round(pojam.Count * 0.58));
                            int brBezNep = pojam.Count;
                            pojam.AddRange(pojamNep);

                            // For every word of the term (shortened variants included).
                            for (int r = 0; r < pojam.Count; r++)
                            {
                                // Only if the page contains the word.
                                if (page.Contains(pojam[r]))
                                {
                                    // Where the word starts on the page.
                                    int where = page.IndexOf(pojam[r]);
                                    // As long as there are further occurrences of the word on the page.
                                    while (where != -1 && where < page.Length)
                                    {
                                        // Number of words found so far.
                                        int           ima      = 1;
                                        List <string> pojamBez = new List <string>(pojam);
                                        // The remaining words, without the one just found.
                                        pojamBez.RemoveAt(r);
                                        if (r < brBezNep && brNep.IndexOf(r) != -1)
                                        {
                                            // Also drop the shortened variant of the found word.
                                            pojamBez.RemoveAt(brBezNep + brNep.IndexOf(r) - 1);
                                        }
                                        else if (r >= brBezNep)
                                        {
                                            // The found word is a shortened variant: also drop the word it came from.
                                            pojamBez.RemoveAt(brNep[r - brBezNep]);
                                        }
                                        // The search window starts 1.5 term lengths before the word ...
                                        int min = Math.Max(0, where - (int)Math.Round(podpojam.Length * 1.5));
                                        // ... and is 3.5 term lengths long, clipped to the end of the page.
                                        int max = Math.Min(page.Length - min, (int)Math.Round(podpojam.Length * 3.5));
                                        // The window is the same for all remaining words, so cut it out once.
                                        string window = page.Substring(min, max);
                                        // For each of the other words.
                                        foreach (string reč in pojamBez)
                                        {
                                            // Is the other word inside the window around the first one?
                                            if (window.Contains(reč))
                                            {
                                                ima++;
                                            }
                                            if (ima >= minIma)
                                            {
                                                break; // term found
                                            }
                                        }
                                        // Record the page if the term was found and the page is not listed yet;
                                        // pages 406-416 and page 5 are excluded.
                                        if (ima >= minIma && !stranice[p].Contains(pageIndex) && (pageIndex < 406 || pageIndex > 416) && pageIndex != 5)
                                        {
                                            stranice[p].Add(pageIndex);
                                            break; // stop looking for this word on this page
                                        }
                                        // Look for the next occurrence after the current one. The search starts at
                                        // where + word length, so that same offset must be added back to obtain an
                                        // index into the page.
                                        int newWhere = page.Substring(where + pojam[r].Length).IndexOf(pojam[r]);
                                        where = newWhere != -1 ? where + pojam[r].Length + newWhere : -1;
                                    }
                                }
                            }
                        }
                    }
                }
            }

            // 'using' flushes and closes the writer even when writing throws.
            using (var file = new StreamWriter("registar.txt"))
            {
                for (int i = 0; i < pojmoviOrg.Length; i++)
                {
                    // Original term, a dash, then the page numbers separated by commas and ended by a period.
                    file.Write(pojmoviOrg[i] + " –");
                    foreach (int stranica in stranice[i])
                    {
                        file.Write(" " + stranica + (stranica != stranice[i].Last() ? "," : "."));
                    }
                    file.Write("\n");
                }
            }
        }