Beispiel #1
0
        /// <summary>
        /// Read the document line by line and collect the chunks returned by
        /// ExtractEmails for every line that matches emailRegex together with
        /// fromRegex and/or toRegex.
        /// </summary>
        /// <param name="document">Document whose FilePath is opened as text</param>
        /// <returns>Chunks tagged "from" or "to"; an empty list when no line matches</returns>
        public IEnumerable<DocumentChunk> Parse(DocumentIdentity document)
        {
            List<DocumentChunk> collected = new List<DocumentChunk>(10);
            using (TextReader input = File.OpenText(document.FilePath))
            {
                for (string current = input.ReadLine(); current != null; current = input.ReadLine())
                {
                    // Line matching fromRegex that also carries an address: tag every chunk "from"
                    bool isFromLine = fromRegex.IsMatch(current) && emailRegex.IsMatch(current);
                    if (isFromLine)
                    {
                        foreach (DocumentChunk sender in ExtractEmails(current))
                        {
                            sender.Metadata = "from";
                            collected.Add(sender);
                        }
                    }

                    // The same line may also match toRegex; it is then extracted a second time
                    bool isToLine = toRegex.IsMatch(current) && emailRegex.IsMatch(current);
                    if (isToLine)
                    {
                        foreach (DocumentChunk recipient in ExtractEmails(current))
                        {
                            recipient.Metadata = "to";
                            collected.Add(recipient);
                        }
                    }
                }
            }

            return collected;
        }
        /// <summary>
        /// Parsing the text PDF fixture must yield at least one chunk.
        /// </summary>
        public void TestParsePdfText()
        {
            IParser pdfParser = new PortableDocumentFormat();
            DocumentIdentity pdfDocument = new DocumentIdentity
            {
                FilePath = Path.GetFullPath(textPdfFile)
            };

            var extracted = pdfParser.Parse(pdfDocument);

            bool hasChunks = extracted.Any();
            Assert.IsTrue(hasChunks, "The PDF doc containt text");
        }
Beispiel #3
0
 /// <summary>
 /// Return the parser that supports the document's file extension
 /// </summary>
 /// <param name="doc">Document whose FilePath extension selects the parser</param>
 /// <returns>
 /// The single parser whose SupportedFileExtensions contains the lower-cased
 /// extension; null when no parser or more than one parser matches, or when
 /// the document or its path is missing
 /// </returns>
 public IParser GetParser(DocumentIdentity doc)
 {
     // Without a path there is no extension to match on
     if (doc == null || doc.FilePath == null)
     {
         return null;
     }

     // Computed once instead of once per parser; invariant casing keeps the
     // match independent of the current thread culture
     string extension = Path.GetExtension(doc.FilePath).ToLowerInvariant();

     IParser match = null;
     foreach (IParser candidate in Parsers)
     {
         if (!candidate.SupportedFileExtensions.Contains(extension))
         {
             continue;
         }

         if (match != null)
         {
             // Second match: the choice is ambiguous, report "no parser"
             return null;
         }

         match = candidate;
     }

     return match;
 }
Beispiel #4
0
        /// <summary>
        /// Parsing the zero-chunk fixture must return an empty, non-null sequence.
        /// </summary>
        public void TestParseNoChunks()
        {
            IParser textParser = new PlainText();
            DocumentIdentity emptyDocument = new DocumentIdentity();
            emptyDocument.FilePath = Path.GetFullPath(@"Lucy.Plugin.Parser\plainText_0Chunks.txt");

            IEnumerable<DocumentChunk> parsed = textParser.Parse(emptyDocument);

            Assert.IsNotNull(parsed, "The result can be empty but not null");
            Assert.IsTrue(!parsed.Any(), "We should have not any chunk");
        }
Beispiel #5
0
        /// <summary>
        /// Add a document for indexing unless a document with the same
        /// FilePath is already in the list
        /// </summary>
        /// <param name="doc">Document to add; must not be null and must have a FilePath</param>
        public virtual void Add(DocumentIdentity doc)
        {
            Contract.Assert(doc != null);
            Contract.Assert(doc.FilePath != null);

            // Any stops at the first match instead of counting the whole list
            bool alreadyListed = this.DocumentIdentity.Any(known => known.FilePath == doc.FilePath);
            if (alreadyListed)
            {
                return;
            }

            this.DocumentIdentity.Add(doc);
        }
Beispiel #6
0
        /// <summary>
        /// The simple mail fixture must produce three chunks: one tagged
        /// "from" and two tagged "to".
        /// </summary>
        public void TestParseEmail()
        {
            DocumentIdentity mail = new DocumentIdentity();
            mail.FilePath = Path.GetFullPath(@"Lucy.Plugin.Parser\simpleMail.eml");
            IParser emlParser = new PlainEml();

            var found = emlParser.Parse(mail);

            int total = found.Count();
            Assert.IsTrue(total == 3, "must be found 3 chuncks, 1 from and 2 to");

            int senders = found.Count(chunk => chunk.Metadata == "from");
            Assert.IsTrue(senders == 1, "from not found");

            int recipients = found.Count(chunk => chunk.Metadata == "to");
            Assert.IsTrue(recipients == 2, "to not found");
        }
Beispiel #7
0
        /// <summary>
        /// Identities that only carry a FilePath compare equal for the same
        /// file and different for different files.
        /// </summary>
        public void ComparisonPartialyLoaded()
        {
            // Reference identity
            DocumentIdentity reference = new DocumentIdentity { FilePath = Path.GetFullPath(fileA) };

            // Second identity pointing at the same file
            DocumentIdentity samePath = new DocumentIdentity { FilePath = Path.GetFullPath(fileA) };
            int sameFileResult = reference.CompareTo(samePath);
            Assert.IsTrue(sameFileResult == 0, "Same files");

            // Identity pointing at another file
            DocumentIdentity otherPath = new DocumentIdentity { FilePath = Path.GetFullPath(fileB) };
            int otherFileResult = reference.CompareTo(otherPath);
            Assert.IsFalse(otherFileResult == 0, "Not same files");
        }
Beispiel #8
0
        /// <summary>
        /// Adding a copy of an already listed document, whatever its State,
        /// must leave the index with a single entry.
        /// </summary>
        public void TestIndexUnicity()
        {
            // Copy of document1A that differs only by its State
            DocumentIdentity duplicate = new DocumentIdentity
            {
                Checksum = document1A.Checksum,
                FilePath = document1A.FilePath,
                DocumentID = document1A.DocumentID,
                State = IndexationStates.Undefined
            };

            DocumentIndex target = new DocumentIndex(IndexDir);
            target.Add(document1A);
            target.Add(duplicate);
            Assert.IsTrue(target.DocumentIdentity.Count == 1, "One different document added");

            // Changing the state must not turn the copy into a new document
            duplicate.State = IndexationStates.NotIndexed;
            target.Add(duplicate);
            Assert.IsTrue(target.DocumentIdentity.Count == 1, "One different document added");
        }
        /// <summary>
        /// Explore a location recursively and return one identity per file found
        /// </summary>
        /// <param name="location">
        /// Location to explore; its State and LastDiscovered are updated by this call
        /// </param>
        /// <returns>
        /// Identities of the discovered files, in no particular order. Files that
        /// raise an IOException, InvalidOperationException or
        /// UnauthorizedAccessException while being inspected are skipped.
        /// </returns>
        public virtual ICollection<DocumentIdentity> Discover(DocumentLocation location)
        {
            Contract.Assert(location != null);
            List<DocumentIdentity> documents = new List<DocumentIdentity>();
            // List<T> is not safe for concurrent writers: every Add issued from
            // the parallel loop goes through this lock
            object documentsGate = new object();
            location.State = DiscoveryStates.Exploring;
            DirectoryInfo dir = new DirectoryInfo(location.Location);
            IEnumerable<FileInfo> files = dir.EnumerateFiles("*.*", SearchOption.AllDirectories);
            Parallel.ForEach(files, (FileInfo file) =>
            {
                try
                {
                    // The file may have disappeared since it was enumerated
                    if (!file.Exists)
                    {
                        return;
                    }

                    DocumentIdentity identity = new DocumentIdentity();
                    identity.DocumentID = ComputeId(file.FullName);
                    identity.Checksum = ComputeChecksum(file);
                    identity.State = IndexationStates.NotIndexed;
                    identity.FilePath = file.FullName;
                    identity.LastIndexed = null;

                    lock (documentsGate)
                    {
                        documents.Add(identity);
                    }
                }
                catch (IOException)
                {
                    // Best effort: a file that cannot be read is left out of the result
                }
                catch (InvalidOperationException)
                {
                    // Best effort: skip the file (the original note here was "Checksum failed")
                }
                catch (UnauthorizedAccessException)
                {
                    // Best effort: skip files the process is not allowed to access
                }
            });

            location.LastDiscovered = DateTime.Now;
            location.State = DiscoveryStates.Explored;
            return documents;
        }
Beispiel #10
0
        /// <summary>
        /// Parse a text document into one chunk per paragraph, where paragraphs
        /// are separated by one or more empty lines
        /// </summary>
        /// <param name="document">Document to parse; its FilePath is opened as text</param>
        /// <returns>
        /// Chunks with Metadata "Content"; the lines of a paragraph are joined
        /// with Environment.NewLine. Empty when the file holds no text, never null.
        /// </returns>
        public virtual IEnumerable<DocumentChunk> Parse(DocumentIdentity document)
        {
            // A postcondition must be declared before any other statement and
            // refer to the return value through Contract.Result
            Contract.Ensures(Contract.Result<IEnumerable<DocumentChunk>>() != null, "Empty collection can be returned but not null reference");
            Contract.Assert(document != null);

            List<DocumentChunk> result = new List<DocumentChunk>();

            using (StreamReader reader = File.OpenText(document.FilePath))
            {
                StringBuilder text = new StringBuilder();
                string line;
                while ((line = reader.ReadLine()) != null)
                {
                    if (line.Length > 0)
                    {
                        // Keep a separator between lines so that the last word of
                        // one line is not fused with the first word of the next
                        if (text.Length > 0)
                        {
                            text.Append(Environment.NewLine);
                        }

                        text.Append(line);
                    }
                    else if (text.Length > 0)
                    {
                        // Empty line after some text: the paragraph is complete
                        DocumentChunk chunk = new DocumentChunk();
                        chunk.Metadata = "Content";
                        chunk.Text = text.ToString();
                        result.Add(chunk);
                        text.Clear();
                    }
                }

                // Post-loop action: the file may end without a trailing empty line
                if (text.Length > 0)
                {
                    DocumentChunk lastChunk = new DocumentChunk();
                    lastChunk.Metadata = "Content";
                    lastChunk.Text = text.ToString();
                    result.Add(lastChunk);
                }
            }

            return result;
        }
Beispiel #11
0
        /// <summary>
        /// Indexed identities of the same file compare equal when their
        /// checksums match and different when they do not.
        /// </summary>
        public void ComparisonChecksum()
        {
            // Reference identity
            DocumentIdentity reference = new DocumentIdentity
            {
                FilePath = Path.GetFullPath(fileA),
                State = IndexationStates.Indexed,
                Checksum = "AAAA"
            };

            // Same file, same checksum
            DocumentIdentity sameChecksum = new DocumentIdentity
            {
                FilePath = Path.GetFullPath(fileA),
                State = IndexationStates.Indexed,
                Checksum = "AAAA"
            };
            int sameResult = reference.CompareTo(sameChecksum);
            Assert.IsTrue(sameResult == 0, "Same files");

            // Same file, different checksum
            DocumentIdentity otherChecksum = new DocumentIdentity
            {
                FilePath = Path.GetFullPath(fileA),
                State = IndexationStates.Indexed,
                Checksum = "CCCC"
            };
            int otherResult = reference.CompareTo(otherChecksum);
            Assert.IsFalse(otherResult == 0, "Not same files");
        }
        /// <summary>
        /// Parse a PDF document and extract its text content
        /// </summary>
        /// <param name="document">Document to analyze; its FilePath is opened read-only</param>
        /// <returns>
        /// One chunk with Metadata "Content" per page that yields text; pages with
        /// no text are skipped. Empty when nothing is extracted, never null.
        /// </returns>
        public IEnumerable<DocumentChunk> Parse(DocumentIdentity document)
        {
            // A postcondition must be declared before any other statement and
            // refer to the return value through Contract.Result
            Contract.Ensures(Contract.Result<IEnumerable<DocumentChunk>>() != null, "Empty collection can be returned but not null reference");
            Contract.Assert(document != null);

            IList<DocumentChunk> result = new List<DocumentChunk>();

            // OpenRead asks for read access only, so read-only files and files
            // opened by other readers can still be parsed. The stream has its
            // own using so it is closed even if the PdfReader constructor throws.
            using (FileStream input = File.OpenRead(document.FilePath))
            using (PdfReader reader = new PdfReader(input))
            {
                // Page numbers passed to the extractor start at 1
                for (int page = 1; page <= reader.NumberOfPages; page++)
                {
                    string pageText = PdfTextExtractor.GetTextFromPage(reader, page);
                    if (String.IsNullOrEmpty(pageText))
                    {
                        continue;
                    }

                    DocumentChunk chunk = new DocumentChunk();
                    chunk.Text = pageText;
                    chunk.Metadata = "Content";
                    result.Add(chunk);
                }
            }

            return result;
        }
Beispiel #13
0
 /// <summary>
 /// Remove a document from the indexing list
 /// </summary>
 /// <param name="doc">Document to remove</param>
 public virtual void Remove(DocumentIdentity doc)
 {
     // Delegate to the underlying list: calling this.Remove here would
     // recurse without end and overflow the stack
     this.DocumentIdentity.Remove(doc);
 }
Beispiel #14
0
        /// <summary>
        /// Build the Lucene document for a file and add it to the index: the
        /// meta-data fields are always written, the content fields only when a
        /// parser is available for the document
        /// </summary>
        /// <param name="write">Index writer receiving the document</param>
        /// <param name="doc">Document to index</param>
        private void Index(IndexWriter write, DocumentIdentity doc)
        {
            Document lucenDoc = new Lucene.Net.Documents.Document();

            // Meta-data
            Field docID = new Field("ID", doc.DocumentID, Field.Store.YES, Field.Index.NO, Field.TermVector.NO);
            lucenDoc.Add(docID);
            Field docName = new Field("Name", Path.GetFileNameWithoutExtension(doc.FilePath), Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.NO);
            lucenDoc.Add(docName);
            Field docExtention = new Field("Extention", Path.GetExtension(doc.FilePath), Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.NO);
            lucenDoc.Add(docExtention);
            Field docCRC = new Field("Checksum", doc.Checksum, Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.NO);
            lucenDoc.Add(docCRC);
            Field locationField = new Field("Location", Path.GetDirectoryName(doc.FilePath), Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.NO);
            lucenDoc.Add(locationField);
            Field dateField = new Field("Last modified", File.GetLastWriteTime(doc.FilePath).ToLongDateString(), Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.NO);
            lucenDoc.Add(dateField);

            // Size of the file in bytes (doc.FilePath.Length would be the number
            // of characters in the path). A file that no longer exists is
            // recorded with size 0 rather than failing the whole indexation.
            FileInfo fileInfo = new FileInfo(doc.FilePath);
            long sizeInBytes = fileInfo.Exists ? fileInfo.Length : 0;
            Field sizeField = new Field("Size", sizeInBytes.ToString(), Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.NO);
            lucenDoc.Add(sizeField);

            // Without a parser the index is limited to the meta-data above
            IParser parser = PluginManager.GetParser(doc);
            if (parser != null)
            {
                // Content: one analyzed, non-stored field per chunk, named after
                // the chunk's Metadata
                foreach (DocumentChunk chunk in parser.Parse(doc))
                {
                    Field field = new Field(chunk.Metadata,
                        chunk.Text,
                        Field.Store.NO,
                        Field.Index.ANALYZED,
                        Field.TermVector.YES);
                    lucenDoc.Add(field);
                }
            }

            write.AddDocument(lucenDoc);
        }
Beispiel #15
0
 /// <summary>
 /// Tell whether the document is in the indexing list
 /// </summary>
 /// <param name="doc">Document to look for</param>
 /// <returns>True when the indexing list contains the document</returns>
 public virtual bool Contain(DocumentIdentity doc)
 {
     bool isListed = this.DocumentIdentity.Contains(doc);
     return isListed;
 }