/// <summary>
/// Parses a plain-text document into chunks, one chunk per paragraph.
/// A paragraph is a run of consecutive non-empty lines; one or more empty
/// lines separate paragraphs. Lines that contain only whitespace are not
/// treated as empty and therefore stay part of the surrounding paragraph.
/// </summary>
/// <param name="document">Document to parse; the file at its <c>FilePath</c> is read as text.</param>
/// <returns>
/// The paragraph chunks in file order, each with <c>Metadata</c> set to "Content".
/// The collection is empty when the file holds no text, never <c>null</c>.
/// </returns>
/// <exception cref="System.ArgumentNullException"><paramref name="document"/> is <c>null</c>.</exception>
public virtual IEnumerable<DocumentChunk> Parse(DocumentIdentity document)
{
    // Explicit guard: Contract.Assert is compiled out of release builds, which
    // previously let a null argument fail later with a NullReferenceException.
    if (document == null)
    {
        throw new System.ArgumentNullException("document");
    }

    // Postconditions must be declared at the top of the method and refer to
    // the return value through Contract.Result<T>().
    Contract.Ensures(
        Contract.Result<IEnumerable<DocumentChunk>>() != null,
        "Empty collection can be returned but not null reference");

    List<DocumentChunk> result = new List<DocumentChunk>();

    using (StreamReader reader = File.OpenText(document.FilePath))
    {
        StringBuilder text = new StringBuilder();
        string line;

        while ((line = reader.ReadLine()) != null)
        {
            if (line.Length == 0)
            {
                // An empty line closes the current paragraph, if there is one.
                // Leading or repeated empty lines produce no chunk.
                if (text.Length > 0)
                {
                    result.Add(new DocumentChunk { Metadata = "Content", Text = text.ToString() });
                    text.Clear();
                }

                continue;
            }

            // Keep the line break between lines of the same paragraph so the
            // last word of one line is not fused with the first word of the next.
            if (text.Length > 0)
            {
                text.AppendLine();
            }

            text.Append(line);
        }

        // The file may end without a trailing empty line: emit the last paragraph.
        if (text.Length > 0)
        {
            result.Add(new DocumentChunk { Metadata = "Content", Text = text.ToString() });
        }
    }

    return result;
}
/// <summary>
/// Parses a PDF document and extracts its text content page by page.
/// </summary>
/// <param name="document">Document to analyze; the file at its <c>FilePath</c> is opened read-only.</param>
/// <returns>
/// One chunk per page that yields non-empty text, in page order, each with
/// <c>Metadata</c> set to "Content". Pages with no extractable text are skipped.
/// The collection may be empty but is never <c>null</c>.
/// </returns>
/// <exception cref="ArgumentNullException"><paramref name="document"/> is <c>null</c>.</exception>
public IEnumerable<DocumentChunk> Parse(DocumentIdentity document)
{
    // Explicit guard: Contract.Assert is compiled out of release builds, which
    // previously let a null argument fail later with a NullReferenceException.
    if (document == null)
    {
        throw new ArgumentNullException("document");
    }

    // Postconditions must be declared at the top of the method and refer to
    // the return value through Contract.Result<T>().
    Contract.Ensures(
        Contract.Result<IEnumerable<DocumentChunk>>() != null,
        "Empty collection can be returned but not null reference");

    IList<DocumentChunk> result = new List<DocumentChunk>();

    // File.OpenRead requests read access only and allows other readers, whereas
    // File.Open(path, FileMode.Open) asks for read/write access with no sharing
    // and so fails on read-only files. The stream has its own using block so it
    // is released even if constructing the PdfReader throws.
    using (FileStream input = File.OpenRead(document.FilePath))
    using (PdfReader reader = new PdfReader(input))
    {
        // PDF page numbers are 1-based.
        for (int pageNumber = 1; pageNumber <= reader.NumberOfPages; pageNumber++)
        {
            string pageText = PdfTextExtractor.GetTextFromPage(reader, pageNumber);
            if (String.IsNullOrEmpty(pageText))
            {
                continue;
            }

            result.Add(new DocumentChunk { Text = pageText, Metadata = "Content" });
        }
    }

    return result;
}