示例#1
0
        /// <summary>
        /// Uploads a father document, adds two attachments to it, deletes the father handle
        /// and then expects both the descriptor collection and the document collection to be empty.
        /// </summary>
        public async Task add_multiple_attachment_to_existing_handle_then_delete_handle()
        {
            // Parent document that owns the attachments.
            var parent = new DocumentHandle("father");
            await _documentStoreClient.UploadAsync(TestConfig.PathToDocumentPdf, parent);
            await UpdateAndWaitAsync().ConfigureAwait(false);

            // Two attachments, both uploaded against the parent with source "Zip".
            await _documentStoreClient.UploadAttachmentAsync(
                TestConfig.PathToDocumentPng,
                parent,
                "Zip",
                Path.GetFileName(TestConfig.PathToDocumentPng));
            await _documentStoreClient.UploadAttachmentAsync(
                TestConfig.PathToOpenDocumentText,
                parent,
                "Zip",
                Path.GetFileName(TestConfig.PathToOpenDocumentText));
            await UpdateAndWaitAsync().ConfigureAwait(false);

            // Removing the parent is expected to leave nothing behind.
            await _documentStoreClient.DeleteAsync(parent);
            await UpdateAndWaitAsync().ConfigureAwait(false);

            var remainingDescriptors = _documentDescriptorCollection.AsQueryable().Count();
            Assert.That(remainingDescriptors, Is.EqualTo(0), "Attachment should be deleted.");

            var remainingDocuments = _documentCollection.AsQueryable().Count();
            Assert.That(remainingDocuments, Is.EqualTo(0), "Attachment should be deleted.");
        }
示例#2
0
        /// <summary>
        /// Uploads a father document, one attachment on the father and one attachment on the
        /// handle "source_1", then checks that GetAttachmentsAsync for the father returns
        /// exactly one attachment.
        /// </summary>
        public async Task attachments_not_retrieve_nested_attachment()
        {
            // Root document.
            var root = new DocumentHandle("father");
            await _documentStoreClient.UploadAsync(TestConfig.PathToDocumentPdf, root);
            await UpdateAndWaitAsync().ConfigureAwait(false); // wait background projection polling

            // Attachment uploaded directly on the root.
            await _documentStoreClient.UploadAttachmentAsync(
                TestConfig.PathToDocumentPng,
                root,
                "source",
                Path.GetFileName(TestConfig.PathToDocumentPng));
            await UpdateAndWaitAsync().ConfigureAwait(false); // wait background projection polling

            // Attachment uploaded on the handle "source_1".
            await _documentStoreClient.UploadAttachmentAsync(
                TestConfig.PathToDocumentPng,
                new DocumentHandle("source_1"),
                "nested",
                Path.GetFileName(TestConfig.PathToDocumentPng));
            await UpdateAndWaitAsync().ConfigureAwait(false); // wait background projection polling

            var attachments = await _documentStoreClient.GetAttachmentsAsync(root);

            Assert.NotNull(attachments);
            Assert.That(attachments.Attachments.Length, Is.EqualTo(1));

            var onlyAttachment = attachments.Attachments[0];
            Assert.That(onlyAttachment.RelativePath, Is.EqualTo(Path.GetFileName(TestConfig.PathToDocumentPng)));
            Assert.That(onlyAttachment.Handle, Is.EqualTo("http://localhost:5123/tests/documents/source_1"));
        }
        /// <summary>
        /// Reads a PDF with ImageMagick, converts the requested pages to the requested image
        /// format and passes each converted page to <paramref name="pageWriter"/>.
        /// </summary>
        /// <param name="pathToFile">Path of the PDF to convert. When passwords are configured a
        /// decrypted copy named "&lt;name&gt;_decrypted.pdf" is created next to it and used instead
        /// if decryption succeeds.</param>
        /// <param name="createPdfImageTaskParams">Supplies Dpi, Format, FromPage (1-based) and Pages.</param>
        /// <param name="pageWriter">Callback receiving the 1-based page number and a stream
        /// positioned at the start of the converted image. Its Boolean result is not inspected.</param>
        /// <returns>Always <c>true</c> when no exception is thrown.</returns>
        public async Task <Boolean> Run(
            String pathToFile,
            CreatePdfImageTaskParams createPdfImageTaskParams,
            Func <int, Stream, Task <Boolean> > pageWriter)
        {
            String tempFileName = null;

            try
            {
                if (Passwords.Count > 0)
                {
                    tempFileName =
                        Path.Combine(Path.GetDirectoryName(pathToFile),
                                     Path.GetFileNameWithoutExtension(pathToFile) + "_decrypted.pdf");
                    if (Decryptor.DecryptFile(pathToFile, tempFileName, Passwords))
                    {
                        pathToFile = tempFileName;
                    }
                }

                using (var sourceStream = File.OpenRead(pathToFile))
                {
                    var settings = new MagickReadSettings
                    {
                        Density = new PointD(createPdfImageTaskParams.Dpi, createPdfImageTaskParams.Dpi)
                    };
                    // NOTE(review): with FrameIndex = 0 and FrameCount = 1 presumably only the first
                    // frame is loaded, so the loop below would yield a page only when FromPage is 1 - confirm.
                    settings.FrameIndex = 0; // First page
                    settings.FrameCount = 1; // Number of pages
                    MagickFormat imageFormat = TranslateFormat(createPdfImageTaskParams.Format);

                    Logger.DebugFormat("Image format is {0}", imageFormat.ToString());
                    using (var images = new MagickImageCollection())
                    {
                        // While _firstDone is false the read happens under LockForInitializationIssue.
                        // NOTE(review): _firstDone is never assigned in this method; confirm it is set
                        // elsewhere, otherwise every read keeps going through the lock.
                        bool done = false;
                        if (!_firstDone)
                        {
                            lock (LockForInitializationIssue)
                            {
                                if (!_firstDone)
                                {
                                    images.Read(sourceStream, settings);
                                    done = true;
                                }
                            }
                        }

                        if (!done)
                        {
                            images.Read(sourceStream, settings);
                        }

                        // Zero-based index of the last image to emit, clamped to what was actually read.
                        var lastImage =
                            Math.Min(createPdfImageTaskParams.FromPage - 1 + createPdfImageTaskParams.Pages, images.Count) -
                            1;
                        for (int page = createPdfImageTaskParams.FromPage - 1; page <= lastImage; page++)
                        {
                            var image = images[page];
                            image.Format = imageFormat;

                            using (var ms = new MemoryStream())
                            {
                                image.Write(ms);
                                // Rewind so the writer reads the image from its first byte.
                                ms.Seek(0L, SeekOrigin.Begin);
                                await pageWriter(page + 1, ms).ConfigureAwait(false);
                            }
                        }
                    }
                }
            }
            finally
            {
                // Remove the decrypted copy even when reading, converting or writing a page throws,
                // so an unprotected version of the document is never left on disk.
                if (!String.IsNullOrEmpty(tempFileName) && File.Exists(tempFileName))
                {
                    File.Delete(tempFileName);
                }
            }

            return(true);
        }
示例#4
0
        /// <summary>
        /// Uploads a father document plus two attachments with source "Content" and checks that
        /// descriptors exist for "content_1", "content_2" and "father", and that the father
        /// descriptor lists exactly those two attachment handles.
        /// </summary>
        public async Task Add_multiple_attachment_to_existing_handle()
        {
            // Parent document.
            var parent = new DocumentHandle("father");
            await _documentStoreClient.UploadAsync(TestConfig.PathToDocumentPdf, parent);
            await UpdateAndWaitAsync().ConfigureAwait(false);

            // Two attachments on the parent.
            await _documentStoreClient
                .UploadAttachmentAsync(
                    TestConfig.PathToDocumentPng,
                    parent,
                    "Content",
                    Path.GetFileName(TestConfig.PathToDocumentPng))
                .ConfigureAwait(false);
            await _documentStoreClient
                .UploadAttachmentAsync(
                    TestConfig.PathToOpenDocumentText,
                    parent,
                    "Content",
                    Path.GetFileName(TestConfig.PathToOpenDocumentText))
                .ConfigureAwait(false);
            await UpdateAndWaitAsync().ConfigureAwait(false);

            // Each child handle must resolve to a descriptor.
            var firstChildFilter = Builders <DocumentDescriptorReadModel> .Filter.Eq("Documents", "content_1");
            var firstChild       = _documentDescriptorCollection.Find(firstChildFilter).SingleOrDefault();
            Assert.That(firstChild, Is.Not.Null, "Document with first child handle was not find.");

            var secondChildFilter = Builders <DocumentDescriptorReadModel> .Filter.Eq("Documents", "content_2");
            var secondChild       = _documentDescriptorCollection.Find(secondChildFilter).SingleOrDefault();
            Assert.That(secondChild, Is.Not.Null, "Document with second child handle was not find.");

            // The father descriptor must reference both children.
            var fatherFilter     = Builders <DocumentDescriptorReadModel> .Filter.Eq("Documents", "father");
            var fatherDescriptor = _documentDescriptorCollection.Find(fatherFilter).SingleOrDefault();
            Assert.That(fatherDescriptor, Is.Not.Null, "Father Handle Not Find");

            var expectedHandles = new[]
            {
                new Core.Model.DocumentHandle("content_1"),
                new Core.Model.DocumentHandle("content_2")
            };
            Assert.That(fatherDescriptor.Attachments.Select(a => a.Handle), Is.EquivalentTo(expectedHandles));
        }
示例#5
0
        /// <summary>
        /// Copies the local file backing <paramref name="blobId"/> into <paramref name="folder"/>
        /// under the file name stored in the blob descriptor, appending " (n)" before the
        /// extension until the name does not collide with an existing file.
        /// </summary>
        /// <param name="blobId">Identifier of the blob to copy.</param>
        /// <param name="folder">Existing destination directory.</param>
        /// <returns>Full path of the copy that was created.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="blobId"/> is null, or
        /// <paramref name="folder"/> is null or empty.</exception>
        /// <exception cref="ArgumentException">The folder does not exist, the descriptor is
        /// missing, or the blob file is missing.</exception>
        public string Download(BlobId blobId, string folder)
        {
            // Argument validation.
            if (blobId == null)
            {
                throw new ArgumentNullException(nameof(blobId));
            }
            if (String.IsNullOrEmpty(folder))
            {
                throw new ArgumentNullException(nameof(folder));
            }
            if (!Directory.Exists(folder))
            {
                throw new ArgumentException($"folder {folder} does not exists", nameof(folder));
            }

            // The descriptor carries the original file name.
            var blobDescriptor = _blobDescriptorCollection.FindOneById(blobId);
            if (blobDescriptor == null)
            {
                throw new ArgumentException($"Descriptor for {blobId} not found in {_blobDescriptorCollection.CollectionNamespace.FullName}");
            }

            // Physical file holding the blob content.
            var sourcePath = _directoryManager.GetFileNameFromBlobId(blobId);
            if (!File.Exists(sourcePath))
            {
                Logger.Error($"Blob {blobId} has descriptor, but blob file {sourcePath} not found in the system.");
                throw new ArgumentException($"Blob {blobId} not found");
            }

            var storedName = blobDescriptor.FileNameWithExtension.ToString();
            var targetPath = Path.Combine(folder, storedName);
            var baseName   = Path.GetFileNameWithoutExtension(storedName);
            var extension  = Path.GetExtension(storedName);

            // Probe "name (1).ext", "name (2).ext", ... until a free name is found.
            for (Int32 suffix = 1; File.Exists(targetPath); suffix++)
            {
                targetPath = Path.Combine(folder, baseName + $" ({suffix})") + extension;
            }

            File.Copy(sourcePath, targetPath);
            return targetPath;
        }
示例#6
0
        /// <summary>
        /// Tells whether the extension of <c>_inputFileName</c> matches, ignoring case, one of the
        /// entries of <c>unzippedHtmlExtension</c>.
        /// </summary>
        /// <returns><c>true</c> when a matching entry exists; otherwise <c>false</c>.</returns>
        private Boolean IsUnzippedHtmlFile()
        {
            var inputExtension = Path.GetExtension(_inputFileName);

            foreach (var candidate in unzippedHtmlExtension)
            {
                if (candidate.Equals(inputExtension, StringComparison.OrdinalIgnoreCase))
                {
                    return true;
                }
            }

            return false;
        }
示例#7
0
        /// <summary>
        /// Downloads the blob of the job and, depending on its extension, extracts the archive
        /// content (.zip, .7z, .7zip, .rar) or the e-mail attachments (.eml, .msg) into a new
        /// sub-folder of <paramref name="workingFolder"/> and uploads what was extracted.
        /// Files with any other extension are left untouched.
        /// </summary>
        /// <param name="parameters">Job parameters. The optional "extensions" entry is split on '|'
        /// and passed to UploadAttachmentListToDocumentStore; it stays null when the entry is
        /// missing or equal to "*".</param>
        /// <param name="workingFolder">Folder where the blob is downloaded and extracted.</param>
        /// <returns><see cref="ProcessResult.Ok"/> when no exception is thrown.</returns>
        protected async override Task <ProcessResult> OnPolling(Shared.Jobs.PollerJobParameters parameters, string workingFolder)
        {
            string localFile = await DownloadBlob(
                parameters.TenantId,
                parameters.JobId,
                parameters.FileName,
                workingFolder);

            String[] permittedExtension = null;
            if (parameters.All.ContainsKey("extensions"))
            {
                var extensionsPermitted = parameters.All["extensions"];
                if (extensionsPermitted != "*")
                {
                    permittedExtension = extensionsPermitted.Split('|');
                }
            }

            var extension          = Path.GetExtension(localFile);
            // A GUID-named folder keeps the extracted content of this run isolated.
            var unzippingDirectory = new DirectoryInfo(Path.Combine(workingFolder, Guid.NewGuid().ToString())).FullName;

            if (!Directory.Exists(unzippingDirectory))
            {
                Directory.CreateDirectory(unzippingDirectory);
            }

            // Extensions are compared ignoring case so that e.g. ".ZIP" or ".Eml" are handled too.
            if (String.Equals(extension, ".zip", StringComparison.OrdinalIgnoreCase))
            {
                //we can handle unzipping everything.
                ZipFile.ExtractToDirectory(localFile, unzippingDirectory);
                IEnumerable <String> files = Directory.EnumerateFiles(unzippingDirectory, "*.*", SearchOption.AllDirectories);
                Int32 uploadCount          = await UploadAttachmentListToDocumentStore(parameters, permittedExtension, unzippingDirectory, files);

                Logger.DebugFormat("Uploaded {0} attachments", uploadCount);
            }
            else if (String.Equals(extension, ".eml", StringComparison.OrdinalIgnoreCase))
            {
                using (var stream = File.Open(localFile, FileMode.Open, FileAccess.Read))
                {
                    var    message  = MsgReader.Mime.Message.Load(stream);
                    var    bodyPart = message.HtmlBody ?? message.TextBody;
                    String body     = "";
                    if (bodyPart != null)
                    {
                        body = bodyPart.GetBodyAsText();
                    }
                    foreach (MsgReader.Mime.MessagePart attachment in message.Attachments.OfType <MsgReader.Mime.MessagePart>())
                    {
                        // An attachment whose content id is referenced by the body is inline
                        // and is skipped.
                        if (!String.IsNullOrEmpty(attachment.ContentId) &&
                            body.Contains(attachment.ContentId))
                        {
                            if (Logger.IsDebugEnabled)
                            {
                                Logger.DebugFormat("Attachment cid {0} name {1} discharded because it is inline", attachment.ContentId, attachment.FileName);
                            }
                            // The skip must not depend on the log level.
                            continue;
                        }

                        // The name comes from the e-mail: keep only its file-name part so that
                        // it cannot point outside the extraction folder.
                        String fileName = Path.Combine(unzippingDirectory, Path.GetFileName(attachment.FileName));
                        File.WriteAllBytes(fileName, attachment.Body);
                        await AddAttachmentToHandle(
                            parameters.TenantId,
                            parameters.JobId,
                            fileName,
                            "attachment_email",
                            attachment.FileName,
                            new Dictionary <string, object>() { }
                            );
                    }
                }
            }
            else if (String.Equals(extension, ".msg", StringComparison.OrdinalIgnoreCase))
            {
                using (var stream = File.Open(localFile, FileMode.Open, FileAccess.Read))
                    using (var message = new Storage.Message(stream))
                    {
                        foreach (Storage.Attachment attachment in message.Attachments.OfType <Storage.Attachment>())
                        {
                            if (attachment.IsInline)
                            {
                                continue; //no need to extract inline attachments
                            }

                            // Same sanitization as for .eml: only the file-name part is used on disk.
                            String fileName = Path.Combine(unzippingDirectory, Path.GetFileName(attachment.FileName));
                            File.WriteAllBytes(fileName, attachment.Data);

                            await AddAttachmentToHandle(
                                parameters.TenantId,
                                parameters.JobId,
                                fileName,
                                "attachment_email",
                                attachment.FileName,
                                new Dictionary <string, object>() { }
                                );
                        }
                    }
            }
            else if (String.Equals(extension, ".7z", StringComparison.OrdinalIgnoreCase) ||
                     String.Equals(extension, ".7zip", StringComparison.OrdinalIgnoreCase) ||
                     String.Equals(extension, ".rar", StringComparison.OrdinalIgnoreCase))
            {
                //we can handle unzipping everything.
                var   extracted   = _sevenZipExtractorFunctions.ExtractTo(localFile, unzippingDirectory);
                Int32 uploadCount = await UploadAttachmentListToDocumentStore(parameters, permittedExtension, unzippingDirectory, extracted);

                Logger.DebugFormat("Uploaded {0} attachments", uploadCount);
            }


            return(ProcessResult.Ok);
        }
        /// <summary>
        /// Composes a single PDF out of the documents listed in the job parameters and uploads
        /// it to the document store under the requested handle.
        /// </summary>
        /// <param name="parameters">Job parameters. Reads "documentList" ('|' separated handles),
        /// "resultingDocumentHandle" and "resultingDocumentFileName" from
        /// <c>parameters.All</c>; the file name is forced to a ".pdf" extension.</param>
        /// <param name="workingFolder">Folder used for downloaded and generated files.</param>
        /// <returns>A requeue result while a handle has no PDF yet and may still get one;
        /// otherwise <see cref="ProcessResult.Ok"/> after the composed file is uploaded.</returns>
        /// <exception cref="ApplicationException">A file could not be appended to the composed PDF.</exception>
        protected async override Task <ProcessResult> OnPolling(
            Shared.Jobs.PollerJobParameters parameters,
            string workingFolder)
        {
            var client              = GetDocumentStoreClient(parameters.TenantId);
            var handles             = parameters.All["documentList"].Split('|');
            var destinationHandle   = parameters.All["resultingDocumentHandle"];
            var destinationFileName = parameters.All["resultingDocumentFileName"];

            if (!destinationFileName.EndsWith(".pdf", StringComparison.OrdinalIgnoreCase))
            {
                destinationFileName = Path.ChangeExtension(destinationFileName, ".pdf");
            }
            List <FileToComposeData> files = new List <FileToComposeData>();

            foreach (var handle in handles)
            {
                var     documentHandle = new DocumentHandle(handle);
                Boolean pdfExists      = false;
                try
                {
                    pdfExists = await InnerGetPdf(workingFolder, client, files, handle, documentHandle, pdfExists);
                }
                catch (System.Net.WebException ex)
                {
                    // Keep the exception in the log so the reason of the failed request is not lost.
                    Logger.WarnFormat(ex, "Handle {0} has no PDF format", handle);
                }

                if (!pdfExists)
                {
                    int requeueCount = GetRequeueCount(parameters);
                    if (requeueCount <= 3) //while the requeue count is at most 3, always retry (gives DS the time to generate jobs)
                    {
                        return(GenerateRequeueProcessResult(requeueCount));
                    }

                    //need to check if this file has some job pending that can generate pdf.
                    var pendingJobs = await client.GetJobsAsync(documentHandle);

                    var fileName = await GetfileNameFromHandle(client, documentHandle);

                    Boolean needWaitForJobToRun = CheckIfSomeJobCanStillProducePdfFormat(pendingJobs, fileName, requeueCount);

                    //need to check if queue that can convert the document are still running. We need to wait for the queue to be stable.
                    if (needWaitForJobToRun)
                    {
                        return(GenerateRequeueProcessResult(requeueCount));
                    }
                    else
                    {
                        //This file has no pdf format, mark as missing pdf.
                        Logger.WarnFormat("Handle {0} has no pdf format, status of queue is {1}", handle, String.Join(",", pendingJobs.Select(j => String.Format("{0}[Executed:{1} Success:{2}]", j.QueueName, j.Executed, j.Success))));
                        files.Add(FileToComposeData.NoPdfFormat(handle, fileName));
                    }
                }
            }
            //now compose everything.
            PdfManipulator manipulator = new PdfManipulator(Logger); //Create a manipulator

            foreach (var fileToCompose in files)
            {
                String pdfFileToAppend = fileToCompose.PdfFileName;
                if (!fileToCompose.HasPdfFormat)
                {
                    // Entries without a PDF are replaced by a generated placeholder file.
                    pdfFileToAppend = GeneratePlaceholderFile(workingFolder, fileToCompose.FileName, fileToCompose.DocumentHandle);
                }

                var error = manipulator.AppendDocumentAtEnd(pdfFileToAppend);
                if (!String.IsNullOrEmpty(error))
                {
                    throw new ApplicationException(String.Format("Unable to compose file {0} error {1}", fileToCompose.DocumentHandle, error));
                }
            }

            manipulator.AddPageNumber();

            // The composed file is saved in its own GUID-named folder under the working folder.
            String outputDirectory = Path.Combine(workingFolder, Guid.NewGuid().ToString());

            Directory.CreateDirectory(outputDirectory);
            var finalFileName = Path.Combine(outputDirectory, destinationFileName);

            manipulator.Save(finalFileName);

            await client.UploadAsync(finalFileName, new DocumentHandle(destinationHandle));

            return(ProcessResult.Ok);
        }
示例#9
0
        /// <summary>
        /// Prepares a downloaded file for analysis: .htmlzip/.htmzip archives are extracted and
        /// the contained html file is returned; .mht/.mhtml files are converted to a plain html
        /// file written next to the original. Any other file is returned unchanged.
        /// </summary>
        /// <param name="pathToFile">Path of the file to prepare.</param>
        /// <param name="workingFolder">Folder where archives are extracted and where the MHTML
        /// parser writes its output.</param>
        /// <returns>Path of the file to analyze. For an archive without a matching .html/.htm
        /// file an error is logged and the original path is returned.</returns>
        private string ProcessFile(string pathToFile, string workingFolder)
        {
            // Invariant lower-casing: culture-sensitive ToLower() would turn the 'I' of
            // ".HTMLZIP" into a dotless 'ı' under Turkish cultures and miss the match.
            var extension = Path.GetExtension(pathToFile).ToLowerInvariant();

            if (extension == ".htmlzip" || extension == ".htmzip")
            {
                ZipFile.ExtractToDirectory(pathToFile, workingFolder);
                Logger.DebugFormat("Extracted zip to {0}", workingFolder);

                // The html file is looked up under the archive's own path with the extension
                // swapped, first as .html then as .htm.
                var htmlFile = Path.ChangeExtension(pathToFile, "html");
                if (File.Exists(htmlFile))
                {
                    Logger.DebugFormat("Html file is {0}", htmlFile);
                    return(htmlFile);
                }

                htmlFile = Path.ChangeExtension(pathToFile, "htm");
                if (File.Exists(htmlFile))
                {
                    Logger.DebugFormat("Html file is {0}", htmlFile);
                    return(htmlFile);
                }

                Logger.ErrorFormat("Invalid HTMLZIP file, name is {0} but corresponding html file not found after decompression", Path.GetFileName(pathToFile));
            }
            else if (extension == ".mht" || extension == ".mhtml")
            {
                MHTMLParser parser = new MHTMLParser(File.ReadAllText(pathToFile));
                parser.OutputDirectory = workingFolder;
                parser.DecodeImageData = false;
                var html = parser.getHTMLText();
                // The extracted html is stored beside the source as "<original name>.html".
                pathToFile = pathToFile + ".html";
                File.WriteAllText(pathToFile, html);
            }
            return(pathToFile);
        }
示例#10
0
        /// <summary>
        /// Extracts the textual content of the job's document with a chain of analyzers and adds
        /// the resulting Content and Tika formats to the document.
        /// </summary>
        /// <param name="parameters">Job parameters (tenant, job id, file name and extension).</param>
        /// <param name="workingFolder">Folder where the blob is downloaded and the tika html is written.</param>
        /// <returns>The result of adding a null content format when the extension is not
        /// supported or the filter manager discards the file; otherwise
        /// <see cref="ProcessResult.Ok"/>.</returns>
        protected async override Task <ProcessResult> OnPolling(
            PollerJobParameters parameters,
            String workingFolder)
        {
            Boolean result;
            var     contentFileName = Path.ChangeExtension(parameters.FileName, ".content");

            if (!_formats.Contains(parameters.FileExtension))
            {
                Logger.DebugFormat("Document for job Id {0} has an extension not supported, setting null content", parameters.JobId);
                return(new ProcessResult(await AddNullContentFormat(parameters, contentFileName)));
            }

            Logger.DebugFormat("Starting tika on job: {0}, file extension {1}", parameters.JobId, parameters.FileExtension);

            Logger.DebugFormat("Downloading blob for job: {0}, on local path {1}", parameters.JobId, workingFolder);
            string pathToFile = await DownloadBlob(parameters.TenantId, parameters.JobId, parameters.FileName, workingFolder);

            pathToFile = ProcessFile(pathToFile, workingFolder);

            Boolean shouldAnalyze = _filterManager.ShouldAnalyze(parameters.FileName, pathToFile);

            if (!shouldAnalyze)
            {
                Logger.InfoFormat("File {0} for job {1} was discharded!", parameters.FileName, parameters.JobId);
                return(new ProcessResult(await AddNullContentFormat(parameters, contentFileName)));
            }
            Logger.DebugFormat("Search for password JobId:{0}", parameters.JobId);
            var     passwords       = ClientPasswordSet.GetPasswordFor(parameters.FileName).ToArray();
            String  content         = "";
            Int32   analyzerOrdinal = 0;
            Boolean success         = false;

            var analyzer = BuildAnalyzer(analyzerOrdinal);

            // Try the analyzers in order until one completes or BuildAnalyzer returns null.
            do
            {
                try
                {
                    if (passwords.Any())
                    {
                        //Try with all the password
                        foreach (var password in passwords)
                        {
                            try
                            {
                                content = analyzer.GetHtmlContent(pathToFile, password) ?? "";
                                break; //first password that can decrypt file break the list of password to try
                            }
                            catch (Exception ex)
                            {
                                // Log the exception too, so the reason of the failure is visible.
                                Logger.ErrorFormat(ex, "Error opening file {0} with password", parameters.FileName);
                            }
                        }
                    }
                    else
                    {
                        //Simply analyze file without password
                        Logger.DebugFormat("Analyze content JobId: {0} -> Path: {1}", parameters.JobId, pathToFile);
                        content = analyzer.GetHtmlContent(pathToFile, "") ?? "";
                    }
                    success = true;
                }
                catch (Exception ex)
                {
                    // The job id is passed as third argument, so the format needs a third placeholder.
                    Logger.ErrorFormat(ex, "Error extracting tika with analyzer {0} on file {1} for job {2}", analyzer.Describe(), parameters.FileName, parameters.JobId);
                    analyzer = BuildAnalyzer(++analyzerOrdinal);
                    if (analyzer != null)
                    {
                        Logger.InfoFormat("Retry job  {0} with analyzer {1}", parameters.JobId, analyzer.Describe());
                    }
                }
            } while (analyzer != null && success == false);

            Logger.DebugFormat("Finished tika on job: {0}, charsNum {1}", parameters.JobId, content.Length);
            String sanitizedContent = content;

            if (!string.IsNullOrWhiteSpace(content))
            {
                var resultContent   = _builder.CreateFromTikaPlain(content);
                var documentContent = resultContent.Content;
                sanitizedContent = resultContent.SanitizedTikaContent;
                var    pages = documentContent.Pages.Count();
                string lang  = null;
                // With more than one page the language is detected on the second page,
                // with exactly one page on that page.
                if (pages > 1)
                {
                    lang = LanguageDetector.GetLanguage(documentContent.Pages[1].Content);
                }

                if (lang == null && pages == 1)
                {
                    lang = LanguageDetector.GetLanguage(documentContent.Pages[0].Content);
                }

                if (lang != null)
                {
                    documentContent.AddMetadata(DocumentContent.MedatataLanguage, lang);
                }

                result = await AddFormatToDocumentFromObject(
                    parameters.TenantId,
                    this.QueueName,
                    parameters.JobId,
                    new DocumentFormat(DocumentFormats.Content),
                    documentContent,
                    contentFileName,
                    new Dictionary <string, object>());

                Logger.DebugFormat("Added format {0} to jobId {1}, result: {2}", DocumentFormats.Content, parameters.JobId, result);
            }

            // The tika html is written and added even when the extracted content is empty.
            var tikaFileName = Path.Combine(workingFolder, Path.GetFileNameWithoutExtension(parameters.FileName) + ".tika.html");

            tikaFileName = SanitizeFileNameForLength(tikaFileName);
            File.WriteAllText(tikaFileName, sanitizedContent);
            result = await AddFormatToDocumentFromFile(
                parameters.TenantId,
                parameters.JobId,
                new DocumentFormat(DocumentFormats.Tika),
                tikaFileName,
                new Dictionary <string, object>());

            Logger.DebugFormat("Added format {0} to jobId {1}, result: {2}", DocumentFormats.Tika, parameters.JobId, result);

            return(ProcessResult.Ok);
        }
示例#11
0
        /// <summary>
        /// Builds the path "&lt;DocumentsFolder&gt;/lang/&lt;lang&gt;.txt".
        /// </summary>
        /// <param name="lang">Name of the language file without extension.</param>
        /// <returns>The combined path; the file is not checked for existence.</returns>
        public static string PathToLangFile(string lang)
        {
            var langFileName = String.Concat(lang, ".txt");

            return Path.Combine(DocumentsFolder, "lang", langFileName);
        }
示例#12
0
 /// <summary>
 /// Builds the path "&lt;temp path&gt;/&lt;new guid&gt;/Queue". Nothing is created on disk.
 /// </summary>
 /// <returns>A path that differs on every call because of the new GUID segment.</returns>
 private static string GenerateQueueFolder()
 {
     var uniqueRoot = Path.Combine(Path.GetTempPath(), Guid.NewGuid().ToString());

     return Path.Combine(uniqueRoot, "Queue");
 }
示例#13
0
        /// <summary>
        /// Tells whether the extension of <paramref name="fileName"/> ends, ignoring case, with
        /// one of the entries of <c>supportedImageExtensions</c>.
        /// </summary>
        /// <param name="fileName">File name or path whose extension is checked.</param>
        /// <returns><c>true</c> when a matching entry exists; otherwise <c>false</c>.</returns>
        public bool CanConvert(string fileName)
        {
            var fileExtension = Path.GetExtension(fileName);

            foreach (var supported in supportedImageExtensions)
            {
                if (fileExtension.EndsWith(supported, StringComparison.OrdinalIgnoreCase))
                {
                    return true;
                }
            }

            return false;
        }
示例#14
0
        /// <summary>
        /// Uploads a father document, an attachment on the father and an attachment on the handle
        /// "source_1", then checks that GetAttachmentsFatAsync for the father returns both
        /// attachments with the expected file names and uris.
        /// </summary>
        public async Task attachments_fat_retrieve_nested_attachment()
        {
            // Root document.
            var root = new DocumentHandle("father");
            await _documentStoreClient.UploadAsync(TestConfig.PathToDocumentPdf, root);
            await UpdateAndWaitAsync(); // wait background projection polling

            // Attachment uploaded directly on the root.
            await _documentStoreClient.UploadAttachmentAsync(
                TestConfig.PathToDocumentPng,
                root,
                "source",
                Path.GetFileName(TestConfig.PathToDocumentPng));
            await UpdateAndWaitAsync(); // wait background projection polling

            // Attachment uploaded on the handle "source_1".
            await _documentStoreClient.UploadAttachmentAsync(
                TestConfig.PathToExcelDocument,
                new DocumentHandle("source_1"),
                "nested",
                Path.GetFileName(TestConfig.PathToExcelDocument));
            await UpdateAndWaitAsync(); // wait background projection polling

            var attachments = await _documentStoreClient.GetAttachmentsFatAsync(root);

            Assert.NotNull(attachments);
            Assert.That(attachments.Attachments, Has.Count.EqualTo(2));

            var expectedFileNames = new[]
            {
                Path.GetFileName(TestConfig.PathToDocumentPng),
                Path.GetFileName(TestConfig.PathToExcelDocument)
            };
            Assert.That(attachments.Attachments.Select(a => a.FileName), Is.EquivalentTo(expectedFileNames));

            var expectedUris = new[]
            {
                new Uri("http://localhost:5123/tests/documents/source_1"),
                new Uri("http://localhost:5123/tests/documents/nested_1")
            };
            Assert.That(attachments.Attachments.Select(a => a.Uri), Is.EquivalentTo(expectedUris));
        }
示例#15
0
        /// <summary>
        /// Uploads a father document and one attachment with source "Content", then checks that
        /// descriptors exist for "content_1" and "father" and that the father descriptor lists
        /// exactly that attachment handle.
        /// </summary>
        public async Task can_add_attachment_to_existing_handle()
        {
            // Parent document.
            var parent = new DocumentHandle("father");
            await _documentStoreClient.UploadAsync(TestConfig.PathToDocumentPdf, parent);
            await UpdateAndWaitAsync(); // wait background projection polling

            // Single attachment on the parent.
            await _documentStoreClient.UploadAttachmentAsync(
                TestConfig.PathToDocumentPng,
                parent,
                "Content",
                Path.GetFileName(TestConfig.PathToDocumentPng));
            await UpdateAndWaitAsync(); // wait background projection polling

            // The child handle must resolve to a descriptor.
            var childFilter     = Builders <DocumentDescriptorReadModel> .Filter.Eq("Documents", "content_1");
            var childDescriptor = _documentDescriptorCollection.Find(childFilter).SingleOrDefault();
            Assert.That(childDescriptor, Is.Not.Null, "Document with child handle was not find.");

            // The father descriptor must reference the child.
            var fatherFilter     = Builders <DocumentDescriptorReadModel> .Filter.Eq("Documents", "father");
            var fatherDescriptor = _documentDescriptorCollection.Find(fatherFilter).SingleOrDefault();
            Assert.That(fatherDescriptor, Is.Not.Null, "Father Handle Not Find");

            var expectedHandles = new[]
            {
                new Jarvis.DocumentStore.Core.Model.DocumentHandle("content_1")
            };
            Assert.That(fatherDescriptor.Attachments.Select(a => a.Handle), Is.EquivalentTo(expectedHandles));
        }