public void Verify_sanitize_of_single_mhtml_file()
        {
            var tempFile = Path.ChangeExtension(Path.GetTempFileName(), ".mht");

            File.Copy(TestConfig.PathToMht, tempFile);

            string      mhtml  = File.ReadAllText(tempFile);
            MHTMLParser parser = new MHTMLParser(mhtml)
            {
                OutputDirectory = Path.GetDirectoryName(tempFile),
                DecodeImageData = true
            };
            var outFile = Path.ChangeExtension(tempFile, ".html");

            File.WriteAllText(outFile, parser.getHTMLText());

            _sanitizer = new SafeHtmlConverter(outFile)
            {
                Logger = NullLogger.Instance
            };
            var result = _sanitizer.Run("jobtest");

            Assert.That(File.Exists(result), "Output pdf file not created");
            File.Delete(result);
        }
示例#2
0
        string DownloadLocalCopy(String tenantId, String jobId)
        {
            Logger.DebugFormat("Downloaded {0}", _inputFileName);

            if (IsUnzippedHtmlFile())
            {
                return(_inputFileName);
            }

            var workingFolder = Path.GetDirectoryName(_inputFileName);

            ZipFile.ExtractToDirectory(_inputFileName, workingFolder);
            Logger.DebugFormat("Extracted zip to {0}", workingFolder);

            var htmlFile = Path.ChangeExtension(_inputFileName, "html");

            if (File.Exists(htmlFile))
            {
                Logger.DebugFormat("Html file is {0}", htmlFile);
                return(htmlFile);
            }

            htmlFile = Path.ChangeExtension(_inputFileName, "htm");
            if (File.Exists(htmlFile))
            {
                Logger.DebugFormat("Html file is {0}", htmlFile);
                return(htmlFile);
            }

            var msg = string.Format("Html file not found for {0}!", jobId);

            Logger.Error(msg);
            throw new Exception(msg);
        }
        public void Verify_preview_of_single_html_file()
        {
            var tempFile = Path.ChangeExtension(Path.GetTempFileName(), ".html");

            File.Copy(TestConfig.PathToSimpleHtmlFile, tempFile);
            _converter        = new HtmlToPdfConverterFromDiskFileOld(tempFile, _config);
            _converter.Logger = NullLogger.Instance;
            var result = _converter.Run("jobtest");

            Assert.That(File.Exists(result), "Output pdf file not created");
            File.Delete(result);
        }
        protected async override Task <ProcessResult> OnPolling(PollerJobParameters parameters, string workingFolder)
        {
            string pathToFile = await DownloadBlob(parameters.TenantId, parameters.JobId, parameters.FileName, workingFolder).ConfigureAwait(false);

            //String fileName = Path.Combine(Path.GetDirectoryName(pathToFile), parameters.All[JobKeys.FileName]);
            //Logger.DebugFormat("Move blob id {0} to real filename {1}", pathToFile, fileName);
            //if (File.Exists(fileName)) File.Delete(fileName);
            //File.Copy(pathToFile, fileName);
            if (Logger.IsDebugEnabled)
            {
                Logger.DebugFormat("Conversion of HtmlZip to PDF: file {0}", pathToFile);
            }

            var file = pathToFile;

            if (pathToFile.EndsWith(".mht", StringComparison.OrdinalIgnoreCase) || pathToFile.EndsWith(".mhtml", StringComparison.OrdinalIgnoreCase))
            {
                string      mhtml  = File.ReadAllText(pathToFile);
                MHTMLParser parser = new MHTMLParser(mhtml)
                {
                    OutputDirectory = workingFolder,
                    DecodeImageData = true
                };
                var outFile = Path.ChangeExtension(pathToFile, ".html");
                File.WriteAllText(outFile, parser.getHTMLText());
                file = outFile;
            }

            var sanitizer = new SafeHtmlConverter(file)
            {
                Logger = Logger
            };

            file = sanitizer.Run(parameters.JobId);


            var converter = new HtmlToPdfConverterFromDiskFile(file, base.JobsHostConfiguration)
            {
                Logger = Logger
            };

            var pdfConvertedFileName = converter.Run(parameters.TenantId, parameters.JobId);

            await AddFormatToDocumentFromFile(
                parameters.TenantId,
                parameters.JobId,
                new DocumentFormat(DocumentFormats.Pdf),
                pdfConvertedFileName,
                new Dictionary <string, object>()).ConfigureAwait(false);

            return(ProcessResult.Ok);
        }
        public void Verify_sanitize_of_single_html_file()
        {
            var tempFile = Path.ChangeExtension(Path.GetTempFileName(), ".html");

            File.Copy(TestConfig.PathToSimpleHtmlFile, tempFile);
            _sanitizer = new SafeHtmlConverter(tempFile)
            {
                Logger = NullLogger.Instance
            };
            var result = _sanitizer.Run("jobtest");

            Assert.That(File.Exists(result), "HTML file sanitized");
            File.Delete(result);
        }
示例#6
0
        public string Run(string sourceFile, string outType)
        {
            var outputFile = Path.ChangeExtension(sourceFile, outType);

            Logger.DebugFormat("UNO CONVERSION: Converting: {0} to {1}", sourceFile, outputFile);

            lock (LockRoot)             // -> single runner (todo: more user profiles)
            {
                ConvertToPdf(sourceFile, outputFile);
            }

            if (!File.Exists(outputFile))
            {
                throw new Exception("Conversion failed");
            }

            return(outputFile);
        }
示例#7
0
        protected async override Task <ProcessResult> OnPolling(PollerJobParameters parameters, string workingFolder)
        {
            string pathToFile = await DownloadBlob(parameters.TenantId, parameters.JobId, parameters.FileName, workingFolder);

            if (Logger.IsDebugEnabled)
            {
                Logger.DebugFormat("Conversion of HtmlZip to PDF: file {0}", pathToFile);
            }

            var file = pathToFile;

            if (pathToFile.ToLower().EndsWith(".mht") || pathToFile.ToLower().EndsWith(".mhtml"))
            {
                string      mhtml  = File.ReadAllText(pathToFile);
                MHTMLParser parser = new MHTMLParser(mhtml);
                parser.OutputDirectory = workingFolder;
                parser.DecodeImageData = true;
                var outFile = Path.ChangeExtension(pathToFile, ".html");
                File.WriteAllText(outFile, parser.getHTMLText());
                file = outFile;
            }

            var converter = new HtmlToPdfConverterFromDiskFileOld(file, base.JobsHostConfiguration)
            {
                Logger = Logger
            };

            var pdfConvertedFileName = converter.Run(parameters.JobId);

            await AddFormatToDocumentFromFile(
                parameters.TenantId,
                parameters.JobId,
                new  DocumentFormat(DocumentFormats.Pdf),
                pdfConvertedFileName,
                new Dictionary <string, object>());

            return(ProcessResult.Ok);
        }
        public string Run(string sourceFile, string outType)
        {
            Logger.DebugFormat("DIRECT SOFFICE.EXE CONVERSION: Starting conversion of blobId {0} to {1}", sourceFile, outType);
            string pathToLibreOffice = _config.GetPathToLibreOffice();
            var    outputFile        = Path.ChangeExtension(sourceFile, outType);

            string arguments = string.Format("--headless -convert-to {2} -outdir \"{0}\"  \"{1}\" ",
                                             Path.GetDirectoryName(sourceFile),
                                             sourceFile,
                                             outType
                                             );

            var psi = new ProcessStartInfo(pathToLibreOffice, arguments)
            {
                UseShellExecute        = false,
                RedirectStandardError  = true,
                RedirectStandardOutput = true,
                CreateNoWindow         = true,
                WindowStyle            = ProcessWindowStyle.Minimized
            };

            Logger.DebugFormat("Command: {0} {1}", pathToLibreOffice, arguments);

            using (var p = Process.Start(psi))
            {
                Logger.Debug("Process started");
                p.WaitForExit();
                Logger.Debug("Process ended");
            }

            if (!File.Exists(outputFile))
            {
                throw new Exception("Conversion failed");
            }

            return(outputFile);
        }
示例#9
0
        private string ProcessFile(string pathToFile, string workingFolder)
        {
            var extension = Path.GetExtension(pathToFile).ToLower();

            if (extension == ".htmlzip" || extension == ".htmzip")
            {
                ZipFile.ExtractToDirectory(pathToFile, workingFolder);
                Logger.DebugFormat("Extracted zip to {0}", workingFolder);

                var htmlFile = Path.ChangeExtension(pathToFile, "html");
                if (File.Exists(htmlFile))
                {
                    Logger.DebugFormat("Html file is {0}", htmlFile);
                    return(htmlFile);
                }

                htmlFile = Path.ChangeExtension(pathToFile, "htm");
                if (File.Exists(htmlFile))
                {
                    Logger.DebugFormat("Html file is {0}", htmlFile);
                    return(htmlFile);
                }

                Logger.ErrorFormat("Invalid HTMLZIP file, name is {0} but corresponding html file not found after decompression", Path.GetFileName(pathToFile));
            }
            else if (extension == ".mht" || extension == ".mhtml")
            {
                MHTMLParser parser = new MHTMLParser(File.ReadAllText(pathToFile));
                parser.OutputDirectory = workingFolder;
                parser.DecodeImageData = false;
                var html = parser.getHTMLText();
                pathToFile = pathToFile + ".html";
                File.WriteAllText(pathToFile, html);
            }
            return(pathToFile);
        }
        protected async override Task <ProcessResult> OnPolling(
            Shared.Jobs.PollerJobParameters parameters,
            string workingFolder)
        {
            var client              = GetDocumentStoreClient(parameters.TenantId);
            var handles             = parameters.All["documentList"].Split('|');
            var destinationHandle   = parameters.All["resultingDocumentHandle"];
            var destinationFileName = parameters.All["resultingDocumentFileName"];

            if (!destinationFileName.EndsWith(".pdf", StringComparison.OrdinalIgnoreCase))
            {
                destinationFileName = Path.ChangeExtension(destinationFileName, ".pdf");
            }
            List <FileToComposeData> files = new List <FileToComposeData>();

            foreach (var handle in handles)
            {
                var     documentHandle = new DocumentHandle(handle);
                Boolean pdfExists      = false;
                try
                {
                    pdfExists = await InnerGetPdf(workingFolder, client, files, handle, documentHandle, pdfExists);
                }
                catch (System.Net.WebException ex)
                {
                    Logger.WarnFormat("Handle {0} has no PDF format", handle);
                }

                if (!pdfExists)
                {
                    int requeueCount = GetRequeueCount(parameters);
                    if (requeueCount <= 3) //first 3 times, always retry (lets DS the time to generate jobs)
                    {
                        return(GenerateRequeueProcessResult(requeueCount));
                    }

                    //need to check if this file has some job pending that can generate pdf.
                    var pendingJobs = await client.GetJobsAsync(documentHandle);

                    var fileName = await GetfileNameFromHandle(client, documentHandle);

                    Boolean needWaitForJobToRun = CheckIfSomeJobCanStillProducePdfFormat(pendingJobs, fileName, requeueCount);

                    //need to check if queue that can convert the document are still running. We need to wait for the queue to be stable.
                    if (needWaitForJobToRun)
                    {
                        return(GenerateRequeueProcessResult(requeueCount));
                    }
                    else
                    {
                        //This file has no pdf format, mark as missing pdf.
                        Logger.WarnFormat("Handle {0} has no pdf format, status of queue is {1}", handle, String.Join(",", pendingJobs.Select(j => String.Format("{0}[Executed:{1} Success:{2}]", j.QueueName, j.Executed, j.Success))));
                        files.Add(FileToComposeData.NoPdfFormat(handle, fileName));
                    }
                }
            }
            //now compose everything.
            PdfManipulator manipulator = new PdfManipulator(Logger); //Create a manipulator

            foreach (var fileToCompose in files)
            {
                String pdfFileToAppend = fileToCompose.PdfFileName;
                if (!fileToCompose.HasPdfFormat)
                {
                    pdfFileToAppend = GeneratePlaceholderFile(workingFolder, fileToCompose.FileName, fileToCompose.DocumentHandle);
                }

                var error = manipulator.AppendDocumentAtEnd(pdfFileToAppend);
                if (!String.IsNullOrEmpty(error))
                {
                    throw new ApplicationException(String.Format("Unable to compose file {0} error {1}", fileToCompose.DocumentHandle, error));
                }
            }

            manipulator.AddPageNumber();

            String outputDirectory = Path.Combine(workingFolder, Guid.NewGuid().ToString());

            Directory.CreateDirectory(outputDirectory);
            var finalFileName = Path.Combine(outputDirectory, destinationFileName);

            manipulator.Save(finalFileName);

            var result = await client.UploadAsync(finalFileName, new DocumentHandle(destinationHandle));

            return(ProcessResult.Ok);
        }
示例#11
0
        protected async override Task <ProcessResult> OnPolling(
            PollerJobParameters parameters,
            String workingFolder)
        {
            Boolean result;
            var     contentFileName = Path.ChangeExtension(parameters.FileName, ".content");

            if (!_formats.Contains(parameters.FileExtension))
            {
                Logger.DebugFormat("Document for job Id {0} has an extension not supported, setting null content", parameters.JobId);
                return(new ProcessResult(await AddNullContentFormat(parameters, contentFileName)));
            }

            Logger.DebugFormat("Starting tika on job: {0}, file extension {1}", parameters.JobId, parameters.FileExtension);

            Logger.DebugFormat("Downloading blob for job: {0}, on local path {1}", parameters.JobId, workingFolder);
            string pathToFile = await DownloadBlob(parameters.TenantId, parameters.JobId, parameters.FileName, workingFolder);

            pathToFile = ProcessFile(pathToFile, workingFolder);

            Boolean shouldAnalyze = _filterManager.ShouldAnalyze(parameters.FileName, pathToFile);

            if (!shouldAnalyze)
            {
                Logger.InfoFormat("File {0} for job {1} was discharded!", parameters.FileName, parameters.JobId);
                return(new ProcessResult(await AddNullContentFormat(parameters, contentFileName)));
            }
            Logger.DebugFormat("Search for password JobId:{0}", parameters.JobId);
            var     passwords       = ClientPasswordSet.GetPasswordFor(parameters.FileName).ToArray();
            String  content         = "";
            Int32   analyzerOrdinal = 0;
            Boolean success         = false;

            var analyzer = BuildAnalyzer(analyzerOrdinal);

            do
            {
                try
                {
                    if (passwords.Any())
                    {
                        //Try with all the password
                        foreach (var password in passwords)
                        {
                            try
                            {
                                content = analyzer.GetHtmlContent(pathToFile, password) ?? "";
                                break; //first password that can decrypt file break the list of password to try
                            }
                            catch (Exception)
                            {
                                Logger.ErrorFormat("Error opening file {0} with password", parameters.FileName);
                            }
                        }
                    }
                    else
                    {
                        //Simply analyze file without password
                        Logger.DebugFormat("Analyze content JobId: {0} -> Path: {1}", parameters.JobId, pathToFile);
                        content = analyzer.GetHtmlContent(pathToFile, "") ?? "";
                    }
                    success = true;
                }
                catch (Exception ex)
                {
                    Logger.ErrorFormat(ex, "Error extracting tika with analyzer {0} on file {1}", analyzer.Describe(), parameters.FileName, parameters.JobId);
                    analyzer = BuildAnalyzer(++analyzerOrdinal);
                    if (analyzer != null)
                    {
                        Logger.InfoFormat("Retry job  {0} with analyzer {1}", parameters.JobId, analyzer.Describe());
                    }
                }
            } while (analyzer != null && success == false);

            Logger.DebugFormat("Finished tika on job: {0}, charsNum {1}", parameters.JobId, content.Count());
            String sanitizedContent = content;

            if (!string.IsNullOrWhiteSpace(content))
            {
                var resultContent   = _builder.CreateFromTikaPlain(content);
                var documentContent = resultContent.Content;
                sanitizedContent = resultContent.SanitizedTikaContent;
                var    pages = documentContent.Pages.Count();
                string lang  = null;
                if (pages > 1)
                {
                    lang = LanguageDetector.GetLanguage(documentContent.Pages[1].Content);
                }

                if (lang == null && pages == 1)
                {
                    lang = LanguageDetector.GetLanguage(documentContent.Pages[0].Content);
                }

                if (lang != null)
                {
                    documentContent.AddMetadata(DocumentContent.MedatataLanguage, lang);
                }

                result = await AddFormatToDocumentFromObject(
                    parameters.TenantId,
                    this.QueueName,
                    parameters.JobId,
                    new DocumentFormat(DocumentFormats.Content),
                    documentContent,
                    contentFileName,
                    new Dictionary <string, object>());

                Logger.DebugFormat("Added format {0} to jobId {1}, result: {2}", DocumentFormats.Content, parameters.JobId, result);
            }

            var tikaFileName = Path.Combine(workingFolder, Path.GetFileNameWithoutExtension(parameters.FileName) + ".tika.html");

            tikaFileName = SanitizeFileNameForLength(tikaFileName);
            File.WriteAllText(tikaFileName, sanitizedContent);
            result = await AddFormatToDocumentFromFile(
                parameters.TenantId,
                parameters.JobId,
                new DocumentFormat(DocumentFormats.Tika),
                tikaFileName,
                new Dictionary <string, object>());

            Logger.DebugFormat("Added format {0} to jobId {1}, result: {2}", DocumentFormats.Tika, parameters.JobId, result);

            return(ProcessResult.Ok);
        }