public IEnumerable<DocumentChunk> Parse(DocumentIdentity document)
{
    List<DocumentChunk> chunks = new List<DocumentChunk>(10);

    using (TextReader reader = File.OpenText(document.FilePath))
    {
        string line;
        while ((line = reader.ReadLine()) != null)
        {
            // "From:" header line: every address found is tagged as a sender
            if (fromRegex.IsMatch(line) && emailRegex.IsMatch(line))
            {
                foreach (DocumentChunk chunk in ExtractEmails(line))
                {
                    chunk.Metadata = "from";
                    chunks.Add(chunk);
                }
            }

            // "To:" header line: every address found is tagged as a recipient
            if (toRegex.IsMatch(line) && emailRegex.IsMatch(line))
            {
                foreach (DocumentChunk chunk in ExtractEmails(line))
                {
                    chunk.Metadata = "to";
                    chunks.Add(chunk);
                }
            }
        }
    }

    return chunks;
}
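The fromRegex, toRegex and emailRegex fields are not part of this snippet; a plausible sketch of what they might look like, offered only as an assumption about their intent (requires System.Text.RegularExpressions):

// Assumed definitions only: the real patterns are not shown in the original code.
private static readonly Regex fromRegex  = new Regex(@"^From:", RegexOptions.IgnoreCase | RegexOptions.Compiled);
private static readonly Regex toRegex    = new Regex(@"^To:", RegexOptions.IgnoreCase | RegexOptions.Compiled);
private static readonly Regex emailRegex = new Regex(@"[\w.+-]+@[\w-]+(\.[\w-]+)+", RegexOptions.Compiled);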
public void TestParsePdfText()
{
    IParser parser = new PortableDocumentFormat();
    DocumentIdentity doc = new DocumentIdentity();
    doc.FilePath = Path.GetFullPath(textPdfFile);

    var chunks = parser.Parse(doc);

    Assert.IsTrue(chunks.Count() > 0, "The PDF document contains text");
}
/// <summary>
/// Return the appropriate parser for a document, based on its file extension
/// </summary>
/// <param name="doc">Document to find a parser for</param>
/// <returns>The matching parser, or null when no single parser supports the extension</returns>
public IParser GetParser(DocumentIdentity doc)
{
    IParser parser = null;
    Func<IParser, bool> predicate = (IParser p) =>
        p.SupportedFileExtensions.Contains(Path.GetExtension(doc.FilePath).ToLower());

    if (Parsers.Count(predicate) == 1)
    {
        parser = Parsers.Single(predicate);
    }

    return parser;
}
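Callers are expected to handle the null result when no single parser claims the extension. A minimal usage sketch, assuming GetParser is exposed by a plugin-manager instance (the pluginManager variable and the sample file name are hypothetical):

// Illustrative only: "pluginManager" and "report.pdf" are assumptions.
DocumentIdentity doc = new DocumentIdentity { FilePath = Path.GetFullPath("report.pdf") };
IParser parser = pluginManager.GetParser(doc);
if (parser == null)
{
    // No parser (or more than one) registered for ".pdf": fall back to metadata-only indexing.
    return;
}
IEnumerable<DocumentChunk> chunks = parser.Parse(doc);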
public void TestParseNoChunks()
{
    IParser plain = new PlainText();
    DocumentIdentity doc = new DocumentIdentity()
    {
        FilePath = Path.GetFullPath(@"Lucy.Plugin.Parser\plainText_0Chunks.txt")
    };

    IEnumerable<DocumentChunk> chunks = plain.Parse(doc);

    Assert.IsNotNull(chunks, "The result can be empty but not null");
    Assert.IsTrue(chunks.Count() == 0, "We should not have any chunks");
}
/// <summary>
/// Add a document for indexing
/// </summary>
/// <param name="doc">Document to add</param>
public virtual void Add(DocumentIdentity doc)
{
    Contract.Assert(doc != null);
    Contract.Assert(doc.FilePath != null);

    int nbOfPath = this.DocumentIdentity.Count(p => p.FilePath == doc.FilePath);
    if (nbOfPath == 0)
    {
        this.DocumentIdentity.Add(doc);
    }
}
public void TestParseEmail()
{
    DocumentIdentity doc = new DocumentIdentity()
    {
        FilePath = Path.GetFullPath(@"Lucy.Plugin.Parser\simpleMail.eml")
    };
    IParser plain = new PlainEml();

    var chunks = plain.Parse(doc);

    Assert.IsTrue(chunks.Count() == 3, "3 chunks must be found: 1 from and 2 to");
    Assert.IsTrue(chunks.Count(p => p.Metadata == "from") == 1, "from not found");
    Assert.IsTrue(chunks.Count(p => p.Metadata == "to") == 2, "to not found");
}
public void ComparisonPartiallyLoaded()
{
    // Reference document
    DocumentIdentity docA = new DocumentIdentity();
    docA.FilePath = Path.GetFullPath(fileA);

    // Same file
    DocumentIdentity docAbis = new DocumentIdentity();
    docAbis.FilePath = Path.GetFullPath(fileA);
    Assert.IsTrue(docA.CompareTo(docAbis) == 0, "Same files");

    // Different file
    DocumentIdentity docB = new DocumentIdentity();
    docB.FilePath = Path.GetFullPath(fileB);
    Assert.IsFalse(docA.CompareTo(docB) == 0, "Not same files");
}
public void TestIndexUnicity()
{
    DocumentIdentity document1ABis = new DocumentIdentity();
    document1ABis.Checksum = document1A.Checksum;
    document1ABis.FilePath = document1A.FilePath;
    document1ABis.DocumentID = document1A.DocumentID;
    document1ABis.State = IndexationStates.Undefined;

    DocumentIndex index = new DocumentIndex(IndexDir);
    index.Add(document1A);
    index.Add(document1ABis);
    Assert.IsTrue(index.DocumentIdentity.Count == 1, "Only one distinct document should be added");

    document1ABis.State = IndexationStates.NotIndexed;
    index.Add(document1ABis);
    Assert.IsTrue(index.DocumentIdentity.Count == 1, "The duplicate must still be ignored, whatever its state");
}
/// <summary>
/// Explore a location and return the individual documents it contains
/// </summary>
/// <param name="location">Location to explore</param>
/// <returns>Discovered documents</returns>
public virtual ICollection<DocumentIdentity> Discover(DocumentLocation location)
{
    Contract.Assert(location != null);

    List<DocumentIdentity> documents = new List<DocumentIdentity>();
    location.State = DiscoveryStates.Exploring;

    DirectoryInfo dir = new DirectoryInfo(location.Location);
    IEnumerable<FileInfo> files = dir.EnumerateFiles("*.*", SearchOption.AllDirectories);

    Parallel.ForEach(files, (FileInfo file) =>
    {
        try
        {
            if (!file.Exists)
            {
                return;
            }

            DocumentIdentity identity = new DocumentIdentity();
            identity.DocumentID = ComputeId(file.FullName);
            identity.Checksum = ComputeChecksum(file);
            identity.State = IndexationStates.NotIndexed;
            identity.FilePath = file.FullName;
            identity.LastIndexed = null;

            // List<T> is not thread-safe: serialize access to the shared collection
            lock (documents)
            {
                documents.Add(identity);
            }
        }
        catch (IOException)
        {
            // File vanished or could not be read: skip it
        }
        catch (InvalidOperationException ex)
        {
            //Logger.Error("Checksum failed");
            //Logger.Error(ex);
        }
        catch (UnauthorizedAccessException ex)
        {
            //Logger.Error("Not authorized to access " + file.FullName);
            //Logger.Error(ex);
        }
    });

    location.LastDiscovered = DateTime.Now;
    location.State = DiscoveryStates.Explored;

    return documents;
}
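A rough sketch of how discovery and indexing might be wired together, assuming DocumentLocation exposes a settable Location property and that a crawler class hosts Discover (the FileSystemCrawler name and both paths are assumptions; DocumentIndex.Add is the method shown above):

// Sketch only: "FileSystemCrawler" and both paths are hypothetical.
DocumentLocation location = new DocumentLocation { Location = @"C:\Documents" };
FileSystemCrawler crawler = new FileSystemCrawler();

DocumentIndex index = new DocumentIndex(@"C:\LucyIndex");
foreach (DocumentIdentity identity in crawler.Discover(location))
{
    // Add ignores documents whose FilePath is already known
    index.Add(identity);
}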
/// <summary>
/// Parse a document
/// </summary>
/// <param name="document">Document to parse</param>
/// <returns>One text chunk per paragraph</returns>
public virtual IEnumerable<DocumentChunk> Parse(DocumentIdentity document)
{
    Contract.Requires(document != null);
    Contract.Ensures(Contract.Result<IEnumerable<DocumentChunk>>() != null,
        "An empty collection can be returned, but never a null reference");

    List<DocumentChunk> result = new List<DocumentChunk>();

    using (StreamReader reader = File.OpenText(document.FilePath))
    {
        DocumentChunk chunk = new DocumentChunk();
        StringBuilder text = new StringBuilder();

        while (!reader.EndOfStream)
        {
            string line = reader.ReadLine();
            bool isEmptyParagraph = line.Length == 0;

            if (isEmptyParagraph && text.Length > 0)
            {
                // Blank line: close the current paragraph and start a new chunk
                chunk.Metadata = "Content";
                chunk.Text = text.ToString();
                result.Add(chunk);

                text.Clear();
                chunk = new DocumentChunk();
            }
            else
            {
                text.Append(line);
            }
        }

        // Post-loop action: add the remaining chunk
        if (text.Length > 0)
        {
            chunk.Text = text.ToString();
            chunk.Metadata = "Content";
            result.Add(chunk);
        }
    }

    return result;
}
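For a file whose paragraphs are separated by blank lines, each paragraph becomes one "Content" chunk. A small usage sketch (the file name is hypothetical; PlainText is the parser exercised by the tests above):

// Hypothetical usage: "notes.txt" contains paragraphs separated by blank lines.
IParser plain = new PlainText();
DocumentIdentity doc = new DocumentIdentity { FilePath = Path.GetFullPath("notes.txt") };

foreach (DocumentChunk chunk in plain.Parse(doc))
{
    Console.WriteLine("[{0}] {1}", chunk.Metadata, chunk.Text);
}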
public void ComparisonChecksum()
{
    // Reference document
    DocumentIdentity docA = new DocumentIdentity();
    docA.FilePath = Path.GetFullPath(fileA);
    docA.State = IndexationStates.Indexed;
    docA.Checksum = "AAAA";

    // Same checksum
    DocumentIdentity docB = new DocumentIdentity();
    docB.FilePath = Path.GetFullPath(fileA);
    docB.State = IndexationStates.Indexed;
    docB.Checksum = "AAAA";
    Assert.IsTrue(docA.CompareTo(docB) == 0, "Same files");

    // Same file, different checksum
    DocumentIdentity docC = new DocumentIdentity();
    docC.FilePath = Path.GetFullPath(fileA);
    docC.State = IndexationStates.Indexed;
    docC.Checksum = "CCCC";
    Assert.IsFalse(docA.CompareTo(docC) == 0, "Not same files");
}
/// <summary>
/// Parse a PDF document and extract its text content
/// </summary>
/// <param name="document">Document to analyze</param>
/// <returns>One text chunk per page</returns>
public IEnumerable<DocumentChunk> Parse(DocumentIdentity document)
{
    Contract.Requires(document != null);
    Contract.Ensures(Contract.Result<IEnumerable<DocumentChunk>>() != null,
        "An empty collection can be returned, but never a null reference");

    IList<DocumentChunk> result = new List<DocumentChunk>();

    using (PdfReader reader = new PdfReader(File.Open(document.FilePath, FileMode.Open)))
    {
        for (int i = 1; i <= reader.NumberOfPages; i++)
        {
            DocumentChunk chunk = new DocumentChunk();
            chunk.Text = PdfTextExtractor.GetTextFromPage(reader, i);
            chunk.Metadata = "Content";

            // Skip pages that contain no extractable text
            if (!String.IsNullOrEmpty(chunk.Text))
            {
                result.Add(chunk);
            }
        }
    }

    return result;
}
/// <summary>
/// Remove a document from the indexing list
/// </summary>
/// <param name="doc">Document to remove</param>
public virtual void Remove(DocumentIdentity doc)
{
    // Remove from the underlying collection (calling this.Remove(doc) here would recurse forever)
    this.DocumentIdentity.Remove(doc);
}
/// <summary>
/// Perform document parsing and update the Lucene index
/// </summary>
/// <param name="writer">Index writer</param>
/// <param name="doc">Document to index</param>
private void Index(IndexWriter writer, DocumentIdentity doc)
{
    Document luceneDoc = new Lucene.Net.Documents.Document();

    // Metadata fields
    Field docID = new Field("ID", doc.DocumentID,
        Field.Store.YES, Field.Index.NO, Field.TermVector.NO);
    luceneDoc.Add(docID);

    Field docName = new Field("Name", Path.GetFileNameWithoutExtension(doc.FilePath),
        Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.NO);
    luceneDoc.Add(docName);

    Field docExtension = new Field("Extension", Path.GetExtension(doc.FilePath),
        Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.NO);
    luceneDoc.Add(docExtension);

    Field docCRC = new Field("Checksum", doc.Checksum,
        Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.NO);
    luceneDoc.Add(docCRC);

    Field locationField = new Field("Location", Path.GetDirectoryName(doc.FilePath),
        Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.NO);
    luceneDoc.Add(locationField);

    Field dateField = new Field("Last modified", File.GetLastWriteTime(doc.FilePath).ToLongDateString(),
        Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.NO);
    luceneDoc.Add(dateField);

    // File size in bytes (doc.FilePath.Length would only give the length of the path string)
    Field sizeField = new Field("Size", new FileInfo(doc.FilePath).Length.ToString(),
        Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.NO);
    luceneDoc.Add(sizeField);

    // No parser available: the index entry is limited to metadata
    IParser parser = PluginManager.GetParser(doc);
    if (parser == null)
    {
        writer.AddDocument(luceneDoc);
        return;
    }

    // Index the content
    IEnumerable<DocumentChunk> chunks = parser.Parse(doc);
    foreach (DocumentChunk chunk in chunks)
    {
        Field field = new Field(chunk.Metadata, chunk.Text,
            Field.Store.NO, Field.Index.ANALYZED, Field.TermVector.YES);
        luceneDoc.Add(field);
    }

    writer.AddDocument(luceneDoc);
}
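Since Index is private, it is presumably driven by a surrounding method that owns the IndexWriter and walks the pending documents. A rough sketch under that assumption; the writer construction, the IndexDir field and the NotIndexed filter are illustrative, not taken from the original code (uses the Lucene.Net.Analysis.Standard, Lucene.Net.Index and Lucene.Net.Store namespaces):

// Sketch only: the surrounding loop and the IndexDir field are assumptions.
Lucene.Net.Store.Directory dir = FSDirectory.Open(new DirectoryInfo(IndexDir));
Analyzer analyzer = new StandardAnalyzer(Lucene.Net.Util.Version.LUCENE_30); // adjust to the referenced Lucene.Net version
IndexWriter writer = new IndexWriter(dir, analyzer, IndexWriter.MaxFieldLength.UNLIMITED);
try
{
    foreach (DocumentIdentity doc in this.DocumentIdentity.Where(d => d.State == IndexationStates.NotIndexed))
    {
        this.Index(writer, doc);
        doc.State = IndexationStates.Indexed;
        doc.LastIndexed = DateTime.Now;
    }
    writer.Commit();
}
finally
{
    writer.Close();
}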
/// <summary>
/// Verify if the document is in the indexing list
/// </summary>
/// <param name="doc">Document to look for</param>
/// <returns>True if the document is in the indexing list</returns>
public virtual bool Contain(DocumentIdentity doc)
{
    return this.DocumentIdentity.Contains(doc);
}