Ejemplo n.º 1
0
        /// <summary>
        /// Parses a plain-text document into paragraph chunks.
        /// </summary>
        /// <remarks>
        /// The file at <c>document.FilePath</c> is read line by line. Consecutive
        /// non-empty lines form one paragraph and an empty line ends it. The lines of a
        /// paragraph are joined with a single space so that the last word of one line
        /// does not fuse with the first word of the next. Every chunk produced carries
        /// the metadata value "Content".
        /// </remarks>
        /// <param name="document">Document to parse; must not be null.</param>
        /// <returns>
        /// One chunk per paragraph, in file order. The collection is empty when the
        /// file contains no text; it is never null.
        /// </returns>
        /// <exception cref="System.ArgumentNullException"><paramref name="document"/> is null.</exception>
        public virtual IEnumerable<DocumentChunk> Parse(DocumentIdentity document)
        {
            // Explicit guard: Contract.Assert is compiled out of release builds, which
            // would let a null argument surface later as a NullReferenceException.
            if (document == null)
            {
                throw new System.ArgumentNullException("document");
            }

            // Postcondition on the actual return value (not on a local variable).
            Contract.Ensures(Contract.Result<IEnumerable<DocumentChunk>>() != null, "Empty collection can be returned but not null reference");

            List<DocumentChunk> result = new List<DocumentChunk>();

            using (StreamReader reader = File.OpenText(document.FilePath))
            {
                StringBuilder paragraph = new StringBuilder();
                string line;

                // ReadLine returns null once the end of the stream is reached.
                while ((line = reader.ReadLine()) != null)
                {
                    if (line.Length == 0)
                    {
                        // An empty line closes the current paragraph; repeated empty
                        // lines (or empty lines before any text) produce no chunk.
                        if (paragraph.Length > 0)
                        {
                            result.Add(new DocumentChunk { Metadata = "Content", Text = paragraph.ToString() });
                            paragraph.Clear();
                        }

                        continue;
                    }

                    // Separate consecutive lines so adjacent words are not merged.
                    if (paragraph.Length > 0)
                    {
                        paragraph.Append(' ');
                    }

                    paragraph.Append(line);
                }

                // The file may end without a trailing empty line: flush what is left.
                if (paragraph.Length > 0)
                {
                    result.Add(new DocumentChunk { Metadata = "Content", Text = paragraph.ToString() });
                }
            }

            return result;
        }
Ejemplo n.º 2
0
        /// <summary>
        /// Parses a PDF document and extracts its text content page by page.
        /// </summary>
        /// <remarks>
        /// The file at <c>document.FilePath</c> is opened read-only. Pages are visited
        /// from 1 through <c>NumberOfPages</c>; pages whose extracted text is null or
        /// empty are skipped. Every chunk produced carries the metadata value "Content".
        /// </remarks>
        /// <param name="document">Document to analyze; must not be null.</param>
        /// <returns>
        /// One text chunk per page that has text, in page order. The collection is
        /// empty when no page yields text; it is never null.
        /// </returns>
        /// <exception cref="ArgumentNullException"><paramref name="document"/> is null.</exception>
        public IEnumerable<DocumentChunk> Parse(DocumentIdentity document)
        {
            // Explicit guard: Contract.Assert is compiled out of release builds, which
            // would let a null argument surface later as a NullReferenceException.
            if (document == null)
            {
                throw new ArgumentNullException("document");
            }

            // Postcondition on the actual return value (not on a local variable).
            Contract.Ensures(Contract.Result<IEnumerable<DocumentChunk>>() != null, "Empty collection can be returned but not null reference");

            IList<DocumentChunk> result = new List<DocumentChunk>();

            // The stream gets its own using so it is released even if the PdfReader
            // constructor throws. OpenRead asks only for read access, so read-only
            // files and files already opened for reading elsewhere can be parsed.
            using (FileStream stream = File.OpenRead(document.FilePath))
            using (PdfReader reader = new PdfReader(stream))
            {
                for (int page = 1; page <= reader.NumberOfPages; page++)
                {
                    string pageText = PdfTextExtractor.GetTextFromPage(reader, page);

                    // Pages without extractable text do not produce a chunk.
                    if (String.IsNullOrEmpty(pageText))
                    {
                        continue;
                    }

                    result.Add(new DocumentChunk { Text = pageText, Metadata = "Content" });
                }
            }

            return result;
        }