/// <summary>
/// Uploads a PDF under the "father" handle, adds two attachments to it, deletes the
/// handle and then verifies that both the descriptor collection and the document
/// collection are empty.
/// </summary>
public async Task add_multiple_attachment_to_existing_handle_then_delete_handle()
{
    // Arrange: the parent document the attachments are added to.
    var parent = new DocumentHandle("father");
    await _documentStoreClient.UploadAsync(TestConfig.PathToDocumentPdf, parent);
    await UpdateAndWaitAsync().ConfigureAwait(false);

    // Add a PNG and an OpenDocument text file as attachments of the parent.
    var pngPath = TestConfig.PathToDocumentPng;
    var pngName = Path.GetFileName(pngPath);
    await _documentStoreClient.UploadAttachmentAsync(pngPath, parent, "Zip", pngName);

    var odtPath = TestConfig.PathToOpenDocumentText;
    var odtName = Path.GetFileName(odtPath);
    await _documentStoreClient.UploadAttachmentAsync(odtPath, parent, "Zip", odtName);
    await UpdateAndWaitAsync().ConfigureAwait(false);

    // Act: remove the parent handle.
    await _documentStoreClient.DeleteAsync(parent);
    await UpdateAndWaitAsync().ConfigureAwait(false);

    // Assert: nothing is left in either collection.
    var remainingDescriptors = _documentDescriptorCollection.AsQueryable().Count();
    Assert.That(remainingDescriptors, Is.EqualTo(0), "Attachment should be deleted.");

    var remainingDocuments = _documentCollection.AsQueryable().Count();
    Assert.That(remainingDocuments, Is.EqualTo(0), "Attachment should be deleted.");
}
/// <summary>
/// Adds an attachment to the "father" handle and a second attachment to the
/// "source_1" handle, then checks that <c>GetAttachmentsAsync</c> on the father
/// returns exactly one attachment, the one pointing at "source_1".
/// </summary>
public async Task attachments_not_retrieve_nested_attachment()
{
    var pngPath = TestConfig.PathToDocumentPng;
    var pngName = Path.GetFileName(pngPath);

    // Parent document.
    var parent = new DocumentHandle("father");
    await _documentStoreClient.UploadAsync(TestConfig.PathToDocumentPdf, parent);
    // wait background projection polling
    await UpdateAndWaitAsync().ConfigureAwait(false);

    // First-level attachment, added to the parent.
    await _documentStoreClient.UploadAttachmentAsync(pngPath, parent, "source", pngName);
    // wait background projection polling
    await UpdateAndWaitAsync().ConfigureAwait(false);

    // Second-level attachment, added to the "source_1" handle.
    var firstLevel = new DocumentHandle("source_1");
    await _documentStoreClient.UploadAttachmentAsync(pngPath, firstLevel, "nested", pngName);
    // wait background projection polling
    await UpdateAndWaitAsync().ConfigureAwait(false);

    var attachments = await _documentStoreClient.GetAttachmentsAsync(parent);

    Assert.NotNull(attachments);
    Assert.That(attachments.Attachments.Length, Is.EqualTo(1));

    var single = attachments.Attachments[0];
    Assert.That(single.RelativePath, Is.EqualTo(Path.GetFileName(TestConfig.PathToDocumentPng)));
    Assert.That(single.Handle, Is.EqualTo("http://localhost:5123/tests/documents/source_1"));
}
/// <summary>
/// Reads a PDF with Magick, converts the requested pages to the requested image
/// format and passes each rendered page to <paramref name="pageWriter"/>.
/// </summary>
/// <param name="pathToFile">Path of the PDF to read.</param>
/// <param name="createPdfImageTaskParams">
/// Supplies <c>Dpi</c> (read density), <c>Format</c> (output image format),
/// <c>FromPage</c> (1-based first page) and <c>Pages</c> (number of pages).
/// </param>
/// <param name="pageWriter">
/// Callback invoked with the 1-based page number and a stream positioned at the
/// start of the rendered image. Its result is not inspected.
/// </param>
/// <returns>Always <c>true</c> when no exception is thrown.</returns>
public async Task<Boolean> Run(
    String pathToFile,
    CreatePdfImageTaskParams createPdfImageTaskParams,
    Func<int, Stream, Task<Boolean>> pageWriter)
{
    String tempFileName = null;
    try
    {
        if (Passwords.Count > 0)
        {
            // Try to produce a decrypted copy next to the source file; it is used
            // only when the decryptor reports success.
            tempFileName = Path.Combine(
                Path.GetDirectoryName(pathToFile),
                Path.GetFileNameWithoutExtension(pathToFile) + "_decrypted.pdf");
            if (Decryptor.DecryptFile(pathToFile, tempFileName, Passwords))
            {
                pathToFile = tempFileName;
            }
        }

        using (var sourceStream = File.OpenRead(pathToFile))
        {
            var settings = new MagickReadSettings
            {
                Density = new PointD(createPdfImageTaskParams.Dpi, createPdfImageTaskParams.Dpi)
            };
            settings.FrameIndex = 0; // First page
            settings.FrameCount = 1; // Number of pages

            MagickFormat imageFormat = TranslateFormat(createPdfImageTaskParams.Format);
            Logger.DebugFormat("Image format is {0}", imageFormat.ToString());

            using (var images = new MagickImageCollection())
            {
                // While _firstDone is false the read is performed under a lock.
                // NOTE(review): _firstDone is never assigned in this method; unless it
                // is set elsewhere every read goes through the lock - confirm intent.
                bool done = false;
                if (!_firstDone)
                {
                    lock (LockForInitializationIssue)
                    {
                        if (!_firstDone)
                        {
                            images.Read(sourceStream, settings);
                            done = true;
                        }
                    }
                }

                if (!done)
                {
                    images.Read(sourceStream, settings);
                }

                // Zero-based index of the last page to render, clamped to what was read.
                var lastImage = Math.Min(
                    createPdfImageTaskParams.FromPage - 1 + createPdfImageTaskParams.Pages,
                    images.Count) - 1;

                for (int page = createPdfImageTaskParams.FromPage - 1; page <= lastImage; page++)
                {
                    var image = images[page];
                    image.Format = imageFormat;

                    using (var ms = new MemoryStream())
                    {
                        image.Write(ms);
                        ms.Seek(0L, SeekOrigin.Begin);
                        await pageWriter(page + 1, ms).ConfigureAwait(false);
                    }
                }
            }
        }
    }
    finally
    {
        // Remove the decrypted copy even when reading, rendering or the page
        // writer throws, so a plain-text copy of the PDF is never left behind.
        if (!String.IsNullOrEmpty(tempFileName) && File.Exists(tempFileName))
        {
            File.Delete(tempFileName);
        }
    }

    return true;
}
/// <summary>
/// Adds two attachments to the "father" handle and checks that descriptors exist
/// for "content_1", "content_2" and "father", and that the father descriptor lists
/// exactly the two attachment handles.
/// </summary>
public async Task Add_multiple_attachment_to_existing_handle()
{
    // Looks up the single descriptor whose "Documents" field equals the given handle name.
    Func<string, DocumentDescriptorReadModel> findByHandle = handleName =>
        _documentDescriptorCollection
            .Find(Builders<DocumentDescriptorReadModel>.Filter.Eq("Documents", handleName))
            .SingleOrDefault();

    // Parent document.
    var parent = new DocumentHandle("father");
    await _documentStoreClient.UploadAsync(TestConfig.PathToDocumentPdf, parent);
    await UpdateAndWaitAsync().ConfigureAwait(false);

    // Two attachments on the same parent.
    var pngPath = TestConfig.PathToDocumentPng;
    await _documentStoreClient
        .UploadAttachmentAsync(pngPath, parent, "Content", Path.GetFileName(pngPath))
        .ConfigureAwait(false);

    var odtPath = TestConfig.PathToOpenDocumentText;
    await _documentStoreClient
        .UploadAttachmentAsync(odtPath, parent, "Content", Path.GetFileName(odtPath))
        .ConfigureAwait(false);
    await UpdateAndWaitAsync().ConfigureAwait(false);

    var firstChild = findByHandle("content_1");
    Assert.That(firstChild, Is.Not.Null, "Document with first child handle was not find.");

    var secondChild = findByHandle("content_2");
    Assert.That(secondChild, Is.Not.Null, "Document with second child handle was not find.");

    var father = findByHandle("father");
    Assert.That(father, Is.Not.Null, "Father Handle Not Find");

    var expectedHandles = new[]
    {
        new Core.Model.DocumentHandle("content_1"),
        new Core.Model.DocumentHandle("content_2")
    };
    Assert.That(father.Attachments.Select(a => a.Handle), Is.EquivalentTo(expectedHandles));
}
/// <summary>
/// Copies the file backing <paramref name="blobId"/> into <paramref name="folder"/>
/// using the file name stored in the blob descriptor. When a file with that name
/// already exists, " (1)", " (2)", ... is appended before the extension until a
/// free name is found.
/// </summary>
/// <param name="blobId">Identifier of the blob to copy.</param>
/// <param name="folder">Existing destination directory.</param>
/// <returns>Full path of the copy that was created.</returns>
/// <exception cref="ArgumentNullException">
/// <paramref name="blobId"/> is null, or <paramref name="folder"/> is null or empty.
/// </exception>
/// <exception cref="ArgumentException">
/// The folder does not exist, no descriptor is found for the blob, or the blob file
/// is missing on disk.
/// </exception>
public string Download(BlobId blobId, string folder)
{
    if (blobId == null)
    {
        throw new ArgumentNullException(nameof(blobId));
    }

    if (String.IsNullOrEmpty(folder))
    {
        throw new ArgumentNullException(nameof(folder));
    }

    if (!Directory.Exists(folder))
    {
        throw new ArgumentException($"folder {folder} does not exists", nameof(folder));
    }

    var blobDescriptor = _blobDescriptorCollection.FindOneById(blobId);
    if (blobDescriptor == null)
    {
        throw new ArgumentException($"Descriptor for {blobId} not found in {_blobDescriptorCollection.CollectionNamespace.FullName}");
    }

    var sourcePath = _directoryManager.GetFileNameFromBlobId(blobId);
    if (!File.Exists(sourcePath))
    {
        Logger.Error($"Blob {blobId} has descriptor, but blob file {sourcePath} not found in the system.");
        throw new ArgumentException($"Blob {blobId} not found");
    }

    var storedName = blobDescriptor.FileNameWithExtension.ToString();
    string targetPath = Path.Combine(folder, storedName);

    // Probe "name (1).ext", "name (2).ext", ... until an unused name is found.
    for (Int32 suffix = 1; File.Exists(targetPath); suffix++)
    {
        var numberedName = Path.GetFileNameWithoutExtension(storedName) + $" ({suffix})";
        targetPath = Path.Combine(folder, numberedName) + Path.GetExtension(storedName);
    }

    File.Copy(sourcePath, targetPath);
    return targetPath;
}
/// <summary>
/// Tells whether the extension of <c>_inputFileName</c> matches, ignoring case,
/// one of the entries in <c>unzippedHtmlExtension</c>.
/// </summary>
/// <returns><c>true</c> when a matching entry is found; otherwise <c>false</c>.</returns>
private Boolean IsUnzippedHtmlFile()
{
    var inputExtension = Path.GetExtension(_inputFileName);

    foreach (var candidate in unzippedHtmlExtension)
    {
        if (candidate.Equals(inputExtension, StringComparison.OrdinalIgnoreCase))
        {
            return true;
        }
    }

    return false;
}
/// <summary>
/// Downloads the blob of the job and, depending on its extension, extracts the
/// files it contains and adds them as attachments:
/// <list type="bullet">
/// <item><description><c>.zip</c>: extracted with <see cref="ZipFile"/> and uploaded through
/// <c>UploadAttachmentListToDocumentStore</c>.</description></item>
/// <item><description><c>.eml</c>: every attachment whose content id is not referenced by
/// the message body is written to disk and added with <c>AddAttachmentToHandle</c>.</description></item>
/// <item><description><c>.msg</c>: every non-inline attachment is written to disk and added
/// with <c>AddAttachmentToHandle</c>.</description></item>
/// <item><description><c>.7z</c>, <c>.7zip</c>, <c>.rar</c>: extracted with
/// <c>_sevenZipExtractorFunctions</c> and uploaded through
/// <c>UploadAttachmentListToDocumentStore</c>.</description></item>
/// </list>
/// Any other extension is left untouched.
/// </summary>
/// <param name="parameters">
/// Job parameters. The optional <c>"extensions"</c> entry is a '|'-separated list of
/// extensions, or "*"; it is forwarded to the upload helper.
/// </param>
/// <param name="workingFolder">Folder used for the download and for the extracted files.</param>
/// <returns><c>ProcessResult.Ok</c>.</returns>
protected async override Task<ProcessResult> OnPolling(Shared.Jobs.PollerJobParameters parameters, string workingFolder)
{
    string localFile = await DownloadBlob(
        parameters.TenantId,
        parameters.JobId,
        parameters.FileName,
        workingFolder);

    // Stays null when no "extensions" entry exists or when it is "*"
    // (presumably meaning "no filter" - the helper that consumes it is not visible here).
    String[] permittedExtension = null;
    if (parameters.All.ContainsKey("extensions"))
    {
        var extensionsPermitted = parameters.All["extensions"];
        if (extensionsPermitted != "*")
        {
            permittedExtension = extensionsPermitted.Split('|');
        }
    }

    var extension = Path.GetExtension(localFile);

    // Every run extracts into its own freshly named sub folder.
    var unzippingDirectory = new DirectoryInfo(Path.Combine(workingFolder, Guid.NewGuid().ToString())).FullName;
    if (!Directory.Exists(unzippingDirectory))
    {
        Directory.CreateDirectory(unzippingDirectory);
    }

    // Extensions are compared ignoring case so that e.g. "ARCHIVE.ZIP" is handled too.
    if (String.Equals(extension, ".zip", StringComparison.OrdinalIgnoreCase))
    {
        //we can handle unzipping everything.
        ZipFile.ExtractToDirectory(localFile, unzippingDirectory);
        IEnumerable<String> files = Directory.EnumerateFiles(unzippingDirectory, "*.*", SearchOption.AllDirectories);
        Int32 uploadCount = await UploadAttachmentListToDocumentStore(parameters, permittedExtension, unzippingDirectory, files);
        Logger.DebugFormat("Uploaded {0} attachments", uploadCount);
    }
    else if (String.Equals(extension, ".eml", StringComparison.OrdinalIgnoreCase))
    {
        using (var stream = File.Open(localFile, FileMode.Open, FileAccess.Read))
        {
            var message = MsgReader.Mime.Message.Load(stream);
            var bodyPart = message.HtmlBody ?? message.TextBody;
            String body = "";
            if (bodyPart != null)
            {
                body = bodyPart.GetBodyAsText();
            }

            foreach (MsgReader.Mime.MessagePart attachment in message.Attachments.OfType<MsgReader.Mime.MessagePart>())
            {
                // An attachment whose content id appears in the body is an inline part.
                if (!String.IsNullOrEmpty(attachment.ContentId) &&
                    body.Contains(attachment.ContentId))
                {
                    if (Logger.IsDebugEnabled)
                    {
                        Logger.DebugFormat("Attachment cid {0} name {1} discharded because it is inline", attachment.ContentId, attachment.FileName);
                    }

                    // Skip regardless of the log level: previously the skip happened
                    // only when debug logging was enabled.
                    continue;
                }

                // Only the file-name part is used locally, so a name carrying directory
                // segments cannot escape the extraction folder.
                String fileName = Path.Combine(unzippingDirectory, Path.GetFileName(attachment.FileName));
                File.WriteAllBytes(fileName, attachment.Body);
                await AddAttachmentToHandle(
                    parameters.TenantId,
                    parameters.JobId,
                    fileName,
                    "attachment_email",
                    attachment.FileName,
                    new Dictionary<string, object>() { }
                );
            }
        }
    }
    else if (String.Equals(extension, ".msg", StringComparison.OrdinalIgnoreCase))
    {
        using (var stream = File.Open(localFile, FileMode.Open, FileAccess.Read))
        using (var message = new Storage.Message(stream))
        {
            foreach (Storage.Attachment attachment in message.Attachments.OfType<Storage.Attachment>())
            {
                if (attachment.IsInline)
                {
                    continue; // inline attachments are not extracted
                }

                // Same file-name-only rule as for .eml attachments.
                String fileName = Path.Combine(unzippingDirectory, Path.GetFileName(attachment.FileName));
                File.WriteAllBytes(fileName, attachment.Data);
                await AddAttachmentToHandle(
                    parameters.TenantId,
                    parameters.JobId,
                    fileName,
                    "attachment_email",
                    attachment.FileName,
                    new Dictionary<string, object>() { }
                );
            }
        }
    }
    else if (String.Equals(extension, ".7z", StringComparison.OrdinalIgnoreCase) ||
             String.Equals(extension, ".7zip", StringComparison.OrdinalIgnoreCase) ||
             String.Equals(extension, ".rar", StringComparison.OrdinalIgnoreCase))
    {
        //we can handle unzipping everything.
        var extracted = _sevenZipExtractorFunctions.ExtractTo(localFile, unzippingDirectory);
        Int32 uploadCount = await UploadAttachmentListToDocumentStore(parameters, permittedExtension, unzippingDirectory, extracted);
        Logger.DebugFormat("Uploaded {0} attachments", uploadCount);
    }

    return ProcessResult.Ok;
}
/// <summary>
/// Composes a single PDF out of the documents listed in the job parameters and
/// uploads it under the requested handle.
/// </summary>
/// <remarks>
/// For every handle in <c>"documentList"</c> ('|'-separated) the PDF is fetched with
/// <c>InnerGetPdf</c>. When a PDF is not available the job is requeued while the
/// requeue count is at most 3, or while
/// <c>CheckIfSomeJobCanStillProducePdfFormat</c> reports that a job may still produce
/// it; otherwise the document is recorded as having no PDF format and a placeholder
/// file is generated for it during composition.
/// </remarks>
/// <param name="parameters">
/// Job parameters; reads <c>"documentList"</c>, <c>"resultingDocumentHandle"</c> and
/// <c>"resultingDocumentFileName"</c> from <c>All</c>.
/// </param>
/// <param name="workingFolder">Folder used for downloaded and generated files.</param>
/// <returns>
/// A requeue result when a PDF may still become available; otherwise
/// <c>ProcessResult.Ok</c> after the composed file has been uploaded.
/// </returns>
/// <exception cref="ApplicationException">
/// The manipulator reports an error while appending a file.
/// </exception>
protected async override Task<ProcessResult> OnPolling(
    Shared.Jobs.PollerJobParameters parameters,
    string workingFolder)
{
    var client = GetDocumentStoreClient(parameters.TenantId);
    var handles = parameters.All["documentList"].Split('|');
    var destinationHandle = parameters.All["resultingDocumentHandle"];
    var destinationFileName = parameters.All["resultingDocumentFileName"];

    // The composed document is always saved with a .pdf extension.
    if (!destinationFileName.EndsWith(".pdf", StringComparison.OrdinalIgnoreCase))
    {
        destinationFileName = Path.ChangeExtension(destinationFileName, ".pdf");
    }

    List<FileToComposeData> files = new List<FileToComposeData>();
    foreach (var handle in handles)
    {
        var documentHandle = new DocumentHandle(handle);
        Boolean pdfExists = false;
        try
        {
            pdfExists = await InnerGetPdf(workingFolder, client, files, handle, documentHandle, pdfExists);
        }
        catch (System.Net.WebException ex)
        {
            // Keep the exception in the log so that a real network failure can be
            // told apart from a format that simply does not exist yet.
            Logger.WarnFormat(ex, "Handle {0} has no PDF format", handle);
        }

        if (!pdfExists)
        {
            int requeueCount = GetRequeueCount(parameters);
            if (requeueCount <= 3) //first 3 times, always retry (lets DS the time to generate jobs)
            {
                return GenerateRequeueProcessResult(requeueCount);
            }

            //need to check if this file has some job pending that can generate pdf.
            var pendingJobs = await client.GetJobsAsync(documentHandle);
            var fileName = await GetfileNameFromHandle(client, documentHandle);
            Boolean needWaitForJobToRun = CheckIfSomeJobCanStillProducePdfFormat(pendingJobs, fileName, requeueCount);

            //need to check if queue that can convert the document are still running. We need to wait for the queue to be stable.
            if (needWaitForJobToRun)
            {
                return GenerateRequeueProcessResult(requeueCount);
            }
            else
            {
                //This file has no pdf format, mark as missing pdf.
                Logger.WarnFormat("Handle {0} has no pdf format, status of queue is {1}",
                    handle,
                    String.Join(",", pendingJobs.Select(j => String.Format("{0}[Executed:{1} Success:{2}]", j.QueueName, j.Executed, j.Success))));
                files.Add(FileToComposeData.NoPdfFormat(handle, fileName));
            }
        }
    }

    //now compose everything.
    PdfManipulator manipulator = new PdfManipulator(Logger); //Create a manipulator
    foreach (var fileToCompose in files)
    {
        String pdfFileToAppend = fileToCompose.PdfFileName;
        if (!fileToCompose.HasPdfFormat)
        {
            // No PDF for this document: append a generated placeholder instead.
            pdfFileToAppend = GeneratePlaceholderFile(workingFolder, fileToCompose.FileName, fileToCompose.DocumentHandle);
        }

        var error = manipulator.AppendDocumentAtEnd(pdfFileToAppend);
        if (!String.IsNullOrEmpty(error))
        {
            throw new ApplicationException(String.Format("Unable to compose file {0} error {1}", fileToCompose.DocumentHandle, error));
        }
    }

    manipulator.AddPageNumber();

    // Save into a uniquely named folder so the requested file name can be used verbatim.
    String outputDirectory = Path.Combine(workingFolder, Guid.NewGuid().ToString());
    Directory.CreateDirectory(outputDirectory);
    var finalFileName = Path.Combine(outputDirectory, destinationFileName);
    manipulator.Save(finalFileName);

    await client.UploadAsync(finalFileName, new DocumentHandle(destinationHandle));
    return ProcessResult.Ok;
}
/// <summary>
/// Prepares a downloaded file for analysis and returns the path of the file that
/// should actually be analyzed.
/// </summary>
/// <remarks>
/// <list type="bullet">
/// <item><description><c>.htmlzip</c> / <c>.htmzip</c>: the archive is extracted into
/// <paramref name="workingFolder"/> and the path of the <c>.html</c> (or <c>.htm</c>)
/// file carrying the same name as the archive is returned. When neither exists an
/// error is logged and the original path is returned.</description></item>
/// <item><description><c>.mht</c> / <c>.mhtml</c>: the HTML text is obtained with
/// <c>MHTMLParser</c>, written to <c>&lt;pathToFile&gt;.html</c> and that path is
/// returned.</description></item>
/// <item><description>Anything else: the original path is returned unchanged.</description></item>
/// </list>
/// Extensions are compared with an ordinal, case-insensitive comparison so the result
/// does not depend on the current culture (a culture-sensitive <c>ToLower()</c>
/// mis-handles the letter 'I' under e.g. the Turkish culture).
/// </remarks>
/// <param name="pathToFile">Path of the downloaded file.</param>
/// <param name="workingFolder">Folder that receives extracted content.</param>
/// <returns>Path of the file to analyze.</returns>
private string ProcessFile(string pathToFile, string workingFolder)
{
    var extension = Path.GetExtension(pathToFile);

    if (String.Equals(extension, ".htmlzip", StringComparison.OrdinalIgnoreCase) ||
        String.Equals(extension, ".htmzip", StringComparison.OrdinalIgnoreCase))
    {
        ZipFile.ExtractToDirectory(pathToFile, workingFolder);
        Logger.DebugFormat("Extracted zip to {0}", workingFolder);

        // The main page is looked up next to the archive, under the archive's own name.
        var htmlFile = Path.ChangeExtension(pathToFile, "html");
        if (File.Exists(htmlFile))
        {
            Logger.DebugFormat("Html file is {0}", htmlFile);
            return htmlFile;
        }

        htmlFile = Path.ChangeExtension(pathToFile, "htm");
        if (File.Exists(htmlFile))
        {
            Logger.DebugFormat("Html file is {0}", htmlFile);
            return htmlFile;
        }

        Logger.ErrorFormat("Invalid HTMLZIP file, name is {0} but corresponding html file not found after decompression", Path.GetFileName(pathToFile));
    }
    else if (String.Equals(extension, ".mht", StringComparison.OrdinalIgnoreCase) ||
             String.Equals(extension, ".mhtml", StringComparison.OrdinalIgnoreCase))
    {
        MHTMLParser parser = new MHTMLParser(File.ReadAllText(pathToFile));
        parser.OutputDirectory = workingFolder;
        parser.DecodeImageData = false;
        var html = parser.getHTMLText();

        pathToFile = pathToFile + ".html";
        File.WriteAllText(pathToFile, html);
    }

    return pathToFile;
}
/// <summary>
/// Extracts the textual content of the job's file with the configured analyzers and
/// adds the <c>Content</c> and <c>Tika</c> formats to the document.
/// </summary>
/// <remarks>
/// Files whose extension is not in <c>_formats</c>, or that the filter manager
/// rejects, get a null content format instead. Analyzers are tried in order
/// (<c>BuildAnalyzer(0)</c>, <c>BuildAnalyzer(1)</c>, ...) until one completes without
/// throwing or <c>BuildAnalyzer</c> returns null. When passwords are known for the
/// file each one is tried in turn and the first that does not throw wins.
/// </remarks>
/// <param name="parameters">Job parameters (tenant, job id, file name and extension).</param>
/// <param name="workingFolder">Folder used for the download and for the generated tika file.</param>
/// <returns>
/// The result of adding the null content format for unsupported or filtered files;
/// otherwise <c>ProcessResult.Ok</c>.
/// </returns>
protected async override Task<ProcessResult> OnPolling(
    PollerJobParameters parameters,
    String workingFolder)
{
    Boolean result;
    var contentFileName = Path.ChangeExtension(parameters.FileName, ".content");

    if (!_formats.Contains(parameters.FileExtension))
    {
        Logger.DebugFormat("Document for job Id {0} has an extension not supported, setting null content", parameters.JobId);
        return new ProcessResult(await AddNullContentFormat(parameters, contentFileName));
    }

    Logger.DebugFormat("Starting tika on job: {0}, file extension {1}", parameters.JobId, parameters.FileExtension);
    Logger.DebugFormat("Downloading blob for job: {0}, on local path {1}", parameters.JobId, workingFolder);

    string pathToFile = await DownloadBlob(parameters.TenantId, parameters.JobId, parameters.FileName, workingFolder);
    pathToFile = ProcessFile(pathToFile, workingFolder);

    Boolean shouldAnalyze = _filterManager.ShouldAnalyze(parameters.FileName, pathToFile);
    if (!shouldAnalyze)
    {
        Logger.InfoFormat("File {0} for job {1} was discharded!", parameters.FileName, parameters.JobId);
        return new ProcessResult(await AddNullContentFormat(parameters, contentFileName));
    }

    Logger.DebugFormat("Search for password JobId:{0}", parameters.JobId);
    var passwords = ClientPasswordSet.GetPasswordFor(parameters.FileName).ToArray();

    String content = "";
    Int32 analyzerOrdinal = 0;
    Boolean success = false;
    var analyzer = BuildAnalyzer(analyzerOrdinal);
    do
    {
        try
        {
            if (passwords.Length > 0)
            {
                //Try with all the password
                foreach (var password in passwords)
                {
                    try
                    {
                        content = analyzer.GetHtmlContent(pathToFile, password) ?? "";
                        break; //first password that can decrypt file break the list of password to try
                    }
                    catch (Exception)
                    {
                        // A failing password is logged and the next one is tried.
                        Logger.ErrorFormat("Error opening file {0} with password", parameters.FileName);
                    }
                }
            }
            else
            {
                //Simply analyze file without password
                Logger.DebugFormat("Analyze content JobId: {0} -> Path: {1}", parameters.JobId, pathToFile);
                content = analyzer.GetHtmlContent(pathToFile, "") ?? "";
            }

            success = true;
        }
        catch (Exception ex)
        {
            // The job id was already passed as an argument but the format string had
            // no placeholder for it, so it never reached the log.
            Logger.ErrorFormat(ex, "Error extracting tika with analyzer {0} on file {1} job {2}", analyzer.Describe(), parameters.FileName, parameters.JobId);

            // Fall back to the next analyzer, if any.
            analyzer = BuildAnalyzer(++analyzerOrdinal);
            if (analyzer != null)
            {
                Logger.InfoFormat("Retry job {0} with analyzer {1}", parameters.JobId, analyzer.Describe());
            }
        }
    } while (analyzer != null && !success);

    Logger.DebugFormat("Finished tika on job: {0}, charsNum {1}", parameters.JobId, content.Length);

    String sanitizedContent = content;
    if (!string.IsNullOrWhiteSpace(content))
    {
        var resultContent = _builder.CreateFromTikaPlain(content);
        var documentContent = resultContent.Content;
        sanitizedContent = resultContent.SanitizedTikaContent;

        var pages = documentContent.Pages.Count();
        string lang = null;

        // NOTE(review): with more than one page only the second page is sampled and
        // there is no fallback to the first page - confirm this is intended.
        if (pages > 1)
        {
            lang = LanguageDetector.GetLanguage(documentContent.Pages[1].Content);
        }

        if (lang == null && pages == 1)
        {
            lang = LanguageDetector.GetLanguage(documentContent.Pages[0].Content);
        }

        if (lang != null)
        {
            documentContent.AddMetadata(DocumentContent.MedatataLanguage, lang);
        }

        result = await AddFormatToDocumentFromObject(
            parameters.TenantId,
            this.QueueName,
            parameters.JobId,
            new DocumentFormat(DocumentFormats.Content),
            documentContent,
            contentFileName,
            new Dictionary<string, object>());
        Logger.DebugFormat("Added format {0} to jobId {1}, result: {2}", DocumentFormats.Content, parameters.JobId, result);
    }

    // The tika format is always written, even when the extracted content is empty.
    var tikaFileName = Path.Combine(workingFolder, Path.GetFileNameWithoutExtension(parameters.FileName) + ".tika.html");
    tikaFileName = SanitizeFileNameForLength(tikaFileName);
    File.WriteAllText(tikaFileName, sanitizedContent);

    result = await AddFormatToDocumentFromFile(
        parameters.TenantId,
        parameters.JobId,
        new DocumentFormat(DocumentFormats.Tika),
        tikaFileName,
        new Dictionary<string, object>());
    Logger.DebugFormat("Added format {0} to jobId {1}, result: {2}", DocumentFormats.Tika, parameters.JobId, result);

    return ProcessResult.Ok;
}
/// <summary>
/// Builds the path <c>DocumentsFolder/lang/&lt;lang&gt;.txt</c>.
/// </summary>
/// <param name="lang">Name used as the file name, without the ".txt" extension.</param>
/// <returns>The combined path; the file is not checked for existence.</returns>
public static string PathToLangFile(string lang)
{
    string fileName = String.Concat(lang, ".txt");
    return Path.Combine(DocumentsFolder, "lang", fileName);
}
/// <summary>
/// Builds a path of the form <c>&lt;temp&gt;/&lt;new guid&gt;/Queue</c>. Only the
/// path string is produced; no directory is created.
/// </summary>
/// <returns>A path under the system temp folder containing a freshly generated GUID.</returns>
private static string GenerateQueueFolder()
{
    string tempRoot = Path.GetTempPath();
    string uniqueFolderName = Guid.NewGuid().ToString();
    return Path.Combine(tempRoot, uniqueFolderName, "Queue");
}
/// <summary>
/// Tells whether the extension of <paramref name="fileName"/> ends, ignoring case,
/// with one of the entries in <c>supportedImageExtensions</c>.
/// </summary>
/// <param name="fileName">File name or path to inspect; may be null.</param>
/// <returns>
/// <c>true</c> when a supported extension matches; <c>false</c> otherwise, including
/// when <paramref name="fileName"/> is null.
/// </returns>
public bool CanConvert(string fileName)
{
    // Path.GetExtension returns null for a null path; without this guard the
    // EndsWith call below would throw a NullReferenceException.
    var extension = Path.GetExtension(fileName);
    if (extension == null)
    {
        return false;
    }

    // A suffix match is used, so an entry matches whether or not it carries the leading dot.
    return supportedImageExtensions.Any(e => extension.EndsWith(e, StringComparison.OrdinalIgnoreCase));
}
/// <summary>
/// Adds a PNG attachment to the "father" handle and an Excel attachment to the
/// "source_1" handle, then checks that <c>GetAttachmentsFatAsync</c> on the father
/// returns both of them, with the expected file names and URIs.
/// </summary>
public async Task attachments_fat_retrieve_nested_attachment()
{
    var pngPath = TestConfig.PathToDocumentPng;
    var excelPath = TestConfig.PathToExcelDocument;

    // Parent document.
    var parent = new DocumentHandle("father");
    await _documentStoreClient.UploadAsync(TestConfig.PathToDocumentPdf, parent);
    // wait background projection polling
    await UpdateAndWaitAsync();

    // First-level attachment, added to the parent.
    await _documentStoreClient.UploadAttachmentAsync(pngPath, parent, "source", Path.GetFileName(pngPath));
    // wait background projection polling
    await UpdateAndWaitAsync();

    // Second-level attachment, added to the "source_1" handle.
    var firstLevel = new DocumentHandle("source_1");
    await _documentStoreClient.UploadAttachmentAsync(excelPath, firstLevel, "nested", Path.GetFileName(excelPath));
    // wait background projection polling
    await UpdateAndWaitAsync();

    var attachments = await _documentStoreClient.GetAttachmentsFatAsync(parent);

    Assert.NotNull(attachments);
    Assert.That(attachments.Attachments, Has.Count.EqualTo(2));

    var expectedNames = new[]
    {
        Path.GetFileName(TestConfig.PathToDocumentPng),
        Path.GetFileName(TestConfig.PathToExcelDocument)
    };
    Assert.That(attachments.Attachments.Select(a => a.FileName), Is.EquivalentTo(expectedNames));

    var expectedUris = new[]
    {
        new Uri("http://localhost:5123/tests/documents/source_1"),
        new Uri("http://localhost:5123/tests/documents/nested_1")
    };
    Assert.That(attachments.Attachments.Select(a => a.Uri), Is.EquivalentTo(expectedUris));
}
/// <summary>
/// Adds one attachment to the "father" handle and checks that descriptors exist for
/// "content_1" and "father", and that the father descriptor lists "content_1" as its
/// only attachment.
/// </summary>
public async Task can_add_attachment_to_existing_handle()
{
    // Looks up the single descriptor whose "Documents" field equals the given handle name.
    Func<string, DocumentDescriptorReadModel> findByHandle = handleName =>
        _documentDescriptorCollection
            .Find(Builders<DocumentDescriptorReadModel>.Filter.Eq("Documents", handleName))
            .SingleOrDefault();

    // Parent document.
    var parent = new DocumentHandle("father");
    await _documentStoreClient.UploadAsync(TestConfig.PathToDocumentPdf, parent);
    // wait background projection polling
    await UpdateAndWaitAsync();

    // Single attachment on the parent.
    var pngPath = TestConfig.PathToDocumentPng;
    await _documentStoreClient.UploadAttachmentAsync(pngPath, parent, "Content", Path.GetFileName(pngPath));
    // wait background projection polling
    await UpdateAndWaitAsync();

    var child = findByHandle("content_1");
    Assert.That(child, Is.Not.Null, "Document with child handle was not find.");

    var father = findByHandle("father");
    Assert.That(father, Is.Not.Null, "Father Handle Not Find");

    var expectedHandles = new[] { new Jarvis.DocumentStore.Core.Model.DocumentHandle("content_1") };
    Assert.That(father.Attachments.Select(a => a.Handle), Is.EquivalentTo(expectedHandles));
}