/// <summary>
/// Loads the content of a PDF file and counts how many paragraphs exist for each paragraph length.
/// </summary>
/// <param name="path">The file path; its extension must be ".pdf" (compared case-insensitively).</param>
/// <exception cref="Exceptions.FileExtensionNotAllowed">
/// Thrown when <paramref name="path"/> does not have a ".pdf" extension.
/// </exception>
public Document(string path) : base(path)
{
    //Check pre-conditions.
    //Ordinal, case-insensitive comparison: independent of the current culture and safe when
    //GetExtension returns null (no NullReferenceException; the path is simply rejected).
    string extension = System.IO.Path.GetExtension(path);
    if (!string.Equals(extension, ".pdf", System.StringComparison.OrdinalIgnoreCase))
    {
        throw new Exceptions.FileExtensionNotAllowed();
    }

    //Init object attributes.
    Lengths = new Dictionary<float, int>();

    //Read the PDF file and count how many paragraphs share each length.
    using (PdfReader reader = new PdfReader(path))
    {
        Utils.TextAsParagraphsExtractionStrategy paragraphReader = new Utils.TextAsParagraphsExtractionStrategy();

        //Pages are numbered starting at 1. The text returned by GetTextFromPage is ignored on purpose:
        //the strategy instance accumulates the paragraphs of every page it is fed with.
        for (int page = 1; page <= reader.NumberOfPages; page++)
        {
            PdfTextExtractor.GetTextFromPage(reader, page, paragraphReader);
        }

        //All the paragraph data is stored inside the paragraphReader (for all the pages).
        foreach (string paragraph in paragraphReader.Paragraphs)
        {
            //The dictionary is keyed by float, so the character count is converted explicitly.
            float length = paragraph.Length;

            //TryGetValue leaves 'count' at 0 for a length that has not been seen yet.
            Lengths.TryGetValue(length, out int count);
            Lengths[length] = count + 1;
        }
    }
}
/// <summary>
/// Loads the content of a PDF file and counts, for each distinct paragraph within the document,
/// how many times each word appears in it.
/// </summary>
/// <param name="path">The file path; its extension must be ".pdf" (compared case-insensitively).</param>
/// <exception cref="Exceptions.FileExtensionNotAllowed">
/// Thrown when <paramref name="path"/> does not have a ".pdf" extension.
/// </exception>
public Document(string path) : base(path)
{
    //Check pre-conditions.
    //Ordinal, case-insensitive comparison: independent of the current culture and safe when
    //GetExtension returns null (no NullReferenceException; the path is simply rejected).
    string extension = System.IO.Path.GetExtension(path);
    if (!string.Equals(extension, ".pdf", System.StringComparison.OrdinalIgnoreCase))
    {
        throw new Exceptions.FileExtensionNotAllowed();
    }

    //Init object attributes.
    Paragraphs = new Dictionary<string, Dictionary<string, int>>();

    //Read the PDF file and store each word appearance inside its paragraph.
    using (PdfReader reader = new PdfReader(path))
    {
        Utils.TextAsParagraphsExtractionStrategy paragraphReader = new Utils.TextAsParagraphsExtractionStrategy();

        //Pages are numbered starting at 1. The text returned by GetTextFromPage is ignored on purpose:
        //the strategy instance accumulates the paragraphs of every page it is fed with.
        for (int page = 1; page <= reader.NumberOfPages; page++)
        {
            PdfTextExtractor.GetTextFromPage(reader, page, paragraphReader);
        }

        //All the paragraph data is stored inside the paragraphReader (for all the pages).
        foreach (string paragraph in paragraphReader.Paragraphs)
        {
            //Repeated paragraph... can be ignored (checked first so its words are not counted for nothing).
            if (Paragraphs.ContainsKey(paragraph))
            {
                continue;
            }

            Dictionary<string, int> words = new Dictionary<string, int>();

            //An empty separator array makes Split break on any white-space character, so words separated
            //by tabs or line breaks are counted individually; RemoveEmptyEntries drops the empty tokens
            //produced by consecutive separators.
            string[] tokens = paragraph.Split(System.Array.Empty<char>(), System.StringSplitOptions.RemoveEmptyEntries);
            foreach (string word in tokens)
            {
                //TryGetValue leaves 'occurrences' at 0 for a word that has not been seen yet.
                words.TryGetValue(word, out int occurrences);
                words[word] = occurrences + 1;
            }

            Paragraphs.Add(paragraph, words);
        }
    }
}