/// <summary>
/// Copies the sample MHT file to a temporary location, converts it to HTML with
/// <c>MHTMLParser</c>, runs the sanitizer on that HTML and asserts that the file
/// returned by the sanitizer exists. The returned file is deleted afterwards.
/// </summary>
public void Verify_sanitize_of_single_mhtml_file()
{
    // Work on a private copy so the sample file is never touched.
    string mhtCopy = Path.ChangeExtension(Path.GetTempFileName(), ".mht");
    File.Copy(TestConfig.PathToMht, mhtCopy);

    string mhtContent = File.ReadAllText(mhtCopy);
    MHTMLParser mhtParser = new MHTMLParser(mhtContent);
    mhtParser.OutputDirectory = Path.GetDirectoryName(mhtCopy);
    mhtParser.DecodeImageData = true;

    // The sanitizer works on a plain HTML file placed next to the MHT copy.
    string htmlPath = Path.ChangeExtension(mhtCopy, ".html");
    File.WriteAllText(htmlPath, mhtParser.getHTMLText());

    _sanitizer = new SafeHtmlConverter(htmlPath);
    _sanitizer.Logger = NullLogger.Instance;
    var sanitizedPath = _sanitizer.Run("jobtest");

    Assert.That(File.Exists(sanitizedPath), "Output pdf file not created");
    File.Delete(sanitizedPath);
}
/// <summary>
/// Resolves the HTML file to work on. When the input is already an unzipped HTML file
/// it is returned as is; otherwise the input is extracted as a zip into its own folder
/// and the file with the same name and an "html" or "htm" extension is returned.
/// </summary>
/// <param name="tenantId">Not used by this method.</param>
/// <param name="jobId">Job identifier, used only in the error message.</param>
/// <returns>Path of the HTML file to process.</returns>
/// <exception cref="Exception">No matching html/htm file exists after extraction.</exception>
string DownloadLocalCopy(String tenantId, String jobId)
{
    Logger.DebugFormat("Downloaded {0}", _inputFileName);
    if (IsUnzippedHtmlFile())
    {
        return _inputFileName;
    }

    string extractionFolder = Path.GetDirectoryName(_inputFileName);
    ZipFile.ExtractToDirectory(_inputFileName, extractionFolder);
    Logger.DebugFormat("Extracted zip to {0}", extractionFolder);

    // Probe "html" first, then "htm"; the first existing file wins.
    foreach (string candidateExtension in new[] { "html", "htm" })
    {
        string candidate = Path.ChangeExtension(_inputFileName, candidateExtension);
        if (File.Exists(candidate))
        {
            Logger.DebugFormat("Html file is {0}", candidate);
            return candidate;
        }
    }

    string notFoundMessage = string.Format("Html file not found for {0}!", jobId);
    Logger.Error(notFoundMessage);
    throw new Exception(notFoundMessage);
}
/// <summary>
/// Copies the simple sample HTML file to a temporary location, runs
/// <c>HtmlToPdfConverterFromDiskFileOld</c> on it and asserts that the file
/// returned by the converter exists. The returned file is deleted afterwards.
/// </summary>
public void Verify_preview_of_single_html_file()
{
    string htmlCopy = Path.ChangeExtension(Path.GetTempFileName(), ".html");
    File.Copy(TestConfig.PathToSimpleHtmlFile, htmlCopy);

    _converter = new HtmlToPdfConverterFromDiskFileOld(htmlCopy, _config)
    {
        Logger = NullLogger.Instance
    };
    var pdfPath = _converter.Run("jobtest");

    Assert.That(File.Exists(pdfPath), "Output pdf file not created");
    File.Delete(pdfPath);
}
/// <summary>
/// Downloads the blob of the job, turns MHT/MHTML input into an HTML file, sanitizes
/// the HTML, converts it to PDF and adds the PDF as a format of the document.
/// </summary>
/// <param name="parameters">Job parameters (tenant, job id and file name are read).</param>
/// <param name="workingFolder">Folder used for the downloaded and generated files.</param>
/// <returns><c>ProcessResult.Ok</c> when every step completed.</returns>
protected async override Task<ProcessResult> OnPolling(PollerJobParameters parameters, string workingFolder)
{
    string downloadedPath = await DownloadBlob(
        parameters.TenantId,
        parameters.JobId,
        parameters.FileName,
        workingFolder).ConfigureAwait(false);

    if (Logger.IsDebugEnabled)
    {
        Logger.DebugFormat("Conversion of HtmlZip to PDF: file {0}", downloadedPath);
    }

    bool isMhtml =
        downloadedPath.EndsWith(".mht", StringComparison.OrdinalIgnoreCase) ||
        downloadedPath.EndsWith(".mhtml", StringComparison.OrdinalIgnoreCase);

    string htmlPath = downloadedPath;
    if (isMhtml)
    {
        // MHT archives are first flattened to a plain HTML file next to the download.
        string mhtContent = File.ReadAllText(downloadedPath);
        MHTMLParser mhtParser = new MHTMLParser(mhtContent);
        mhtParser.OutputDirectory = workingFolder;
        mhtParser.DecodeImageData = true;

        htmlPath = Path.ChangeExtension(downloadedPath, ".html");
        File.WriteAllText(htmlPath, mhtParser.getHTMLText());
    }

    var htmlSanitizer = new SafeHtmlConverter(htmlPath);
    htmlSanitizer.Logger = Logger;
    string sanitizedPath = htmlSanitizer.Run(parameters.JobId);

    var pdfConverter = new HtmlToPdfConverterFromDiskFile(sanitizedPath, base.JobsHostConfiguration);
    pdfConverter.Logger = Logger;
    var pdfPath = pdfConverter.Run(parameters.TenantId, parameters.JobId);

    await AddFormatToDocumentFromFile(
        parameters.TenantId,
        parameters.JobId,
        new DocumentFormat(DocumentFormats.Pdf),
        pdfPath,
        new Dictionary<string, object>()).ConfigureAwait(false);

    return ProcessResult.Ok;
}
/// <summary>
/// Copies the simple sample HTML file to a temporary location, runs the sanitizer
/// on it and asserts that the file returned by the sanitizer exists. The returned
/// file is deleted afterwards.
/// </summary>
public void Verify_sanitize_of_single_html_file()
{
    string htmlCopy = Path.ChangeExtension(Path.GetTempFileName(), ".html");
    File.Copy(TestConfig.PathToSimpleHtmlFile, htmlCopy);

    _sanitizer = new SafeHtmlConverter(htmlCopy);
    _sanitizer.Logger = NullLogger.Instance;
    var sanitizedPath = _sanitizer.Run("jobtest");

    Assert.That(File.Exists(sanitizedPath), "HTML file sanitized");
    File.Delete(sanitizedPath);
}
/// <summary>
/// Converts <paramref name="sourceFile"/> through <c>ConvertToPdf</c>, writing the result
/// next to the source with its extension replaced by <paramref name="outType"/>.
/// </summary>
/// <param name="sourceFile">Path of the file to convert.</param>
/// <param name="outType">Extension given to the output file.</param>
/// <returns>Path of the converted file.</returns>
/// <exception cref="Exception">The output file does not exist after the conversion.</exception>
public string Run(string sourceFile, string outType)
{
    string convertedPath = Path.ChangeExtension(sourceFile, outType);
    Logger.DebugFormat("UNO CONVERSION: Converting: {0} to {1}", sourceFile, convertedPath);

    // Conversions are serialized on LockRoot: only one runs at a time.
    lock (LockRoot)
    {
        ConvertToPdf(sourceFile, convertedPath);
    }

    if (File.Exists(convertedPath))
    {
        return convertedPath;
    }

    throw new Exception("Conversion failed");
}
/// <summary>
/// Downloads the blob of the job, turns MHT/MHTML input into an HTML file, converts the
/// HTML to PDF with <c>HtmlToPdfConverterFromDiskFileOld</c> and adds the PDF as a format
/// of the document.
/// </summary>
/// <param name="parameters">Job parameters (tenant, job id and file name are read).</param>
/// <param name="workingFolder">Folder used for the downloaded and generated files.</param>
/// <returns><c>ProcessResult.Ok</c> when every step completed.</returns>
protected async override Task<ProcessResult> OnPolling(PollerJobParameters parameters, string workingFolder)
{
    string pathToFile = await DownloadBlob(parameters.TenantId, parameters.JobId, parameters.FileName, workingFolder);

    if (Logger.IsDebugEnabled)
    {
        Logger.DebugFormat("Conversion of HtmlZip to PDF: file {0}", pathToFile);
    }

    var file = pathToFile;

    // File extensions are identifiers, not linguistic text: compare them ordinally and
    // case-insensitively instead of lower-casing with the current culture.
    bool isMhtml =
        pathToFile.EndsWith(".mht", StringComparison.OrdinalIgnoreCase) ||
        pathToFile.EndsWith(".mhtml", StringComparison.OrdinalIgnoreCase);

    if (isMhtml)
    {
        // MHT archives are first flattened to a plain HTML file next to the download.
        string mhtml = File.ReadAllText(pathToFile);
        MHTMLParser parser = new MHTMLParser(mhtml)
        {
            OutputDirectory = workingFolder,
            DecodeImageData = true
        };
        var outFile = Path.ChangeExtension(pathToFile, ".html");
        File.WriteAllText(outFile, parser.getHTMLText());
        file = outFile;
    }

    var converter = new HtmlToPdfConverterFromDiskFileOld(file, base.JobsHostConfiguration)
    {
        Logger = Logger
    };
    var pdfConvertedFileName = converter.Run(parameters.JobId);

    await AddFormatToDocumentFromFile(
        parameters.TenantId,
        parameters.JobId,
        new DocumentFormat(DocumentFormats.Pdf),
        pdfConvertedFileName,
        new Dictionary<string, object>());

    return ProcessResult.Ok;
}
/// <summary>
/// Converts <paramref name="sourceFile"/> by launching the LibreOffice executable in
/// headless mode; the output is written in the same directory as the source with its
/// extension replaced by <paramref name="outType"/>.
/// </summary>
/// <param name="sourceFile">Path of the file to convert.</param>
/// <param name="outType">Target format, also used as the output file extension.</param>
/// <returns>Path of the converted file.</returns>
/// <exception cref="Exception">The output file does not exist after the process ended.</exception>
public string Run(string sourceFile, string outType)
{
    Logger.DebugFormat("DIRECT SOFFICE.EXE CONVERSION: Starting conversion of blobId {0} to {1}", sourceFile, outType);
    string pathToLibreOffice = _config.GetPathToLibreOffice();
    var outputFile = Path.ChangeExtension(sourceFile, outType);

    string arguments = string.Format("--headless -convert-to {2} -outdir \"{0}\" \"{1}\" ",
        Path.GetDirectoryName(sourceFile),
        sourceFile,
        outType
    );

    var psi = new ProcessStartInfo(pathToLibreOffice, arguments)
    {
        UseShellExecute = false,
        RedirectStandardError = true,
        RedirectStandardOutput = true,
        CreateNoWindow = true,
        WindowStyle = ProcessWindowStyle.Minimized
    };

    Logger.DebugFormat("Command: {0} {1}", pathToLibreOffice, arguments);

    using (var p = Process.Start(psi))
    {
        Logger.Debug("Process started");

        // Both streams are redirected, so they must be drained: if nobody reads them the
        // child blocks as soon as a pipe buffer fills up and WaitForExit() never returns.
        p.OutputDataReceived += (sender, e) =>
        {
            if (e.Data != null)
            {
                Logger.DebugFormat("soffice stdout: {0}", e.Data);
            }
        };
        p.ErrorDataReceived += (sender, e) =>
        {
            if (e.Data != null)
            {
                Logger.DebugFormat("soffice stderr: {0}", e.Data);
            }
        };
        p.BeginOutputReadLine();
        p.BeginErrorReadLine();

        // The parameterless overload also waits for the asynchronous readers to reach EOF.
        p.WaitForExit();
        Logger.Debug("Process ended");
    }

    if (!File.Exists(outputFile))
    {
        throw new Exception("Conversion failed");
    }

    return outputFile;
}
/// <summary>
/// Prepares a downloaded file for analysis.
/// <list type="bullet">
/// <item>".htmlzip" / ".htmzip": the archive is extracted into <paramref name="workingFolder"/>
/// and the file with the same name and an "html" or "htm" extension is returned.</item>
/// <item>".mht" / ".mhtml": the HTML text is extracted with <c>MHTMLParser</c> and written to
/// a new file named after the original with ".html" appended.</item>
/// <item>Anything else: the path is returned unchanged.</item>
/// </list>
/// </summary>
/// <param name="pathToFile">Path of the downloaded file.</param>
/// <param name="workingFolder">Folder where archives are extracted.</param>
/// <returns>
/// Path of the file to analyze. When an html zip does not contain the expected html file an
/// error is logged and the original path is returned.
/// </returns>
private string ProcessFile(string pathToFile, string workingFolder)
{
    // Compare extensions ordinally: a culture-sensitive ToLower() maps 'I' to a dotless 'ı'
    // under Turkish cultures, so ".HTMLZIP" would never match ".htmlzip".
    string extension = Path.GetExtension(pathToFile);
    bool isHtmlZip =
        string.Equals(extension, ".htmlzip", StringComparison.OrdinalIgnoreCase) ||
        string.Equals(extension, ".htmzip", StringComparison.OrdinalIgnoreCase);
    bool isMhtml =
        string.Equals(extension, ".mht", StringComparison.OrdinalIgnoreCase) ||
        string.Equals(extension, ".mhtml", StringComparison.OrdinalIgnoreCase);

    if (isHtmlZip)
    {
        ZipFile.ExtractToDirectory(pathToFile, workingFolder);
        Logger.DebugFormat("Extracted zip to {0}", workingFolder);

        var htmlFile = Path.ChangeExtension(pathToFile, "html");
        if (File.Exists(htmlFile))
        {
            Logger.DebugFormat("Html file is {0}", htmlFile);
            return htmlFile;
        }

        htmlFile = Path.ChangeExtension(pathToFile, "htm");
        if (File.Exists(htmlFile))
        {
            Logger.DebugFormat("Html file is {0}", htmlFile);
            return htmlFile;
        }

        // Best effort: log and fall through so the caller still receives the original path.
        Logger.ErrorFormat("Invalid HTMLZIP file, name is {0} but corresponding html file not found after decompression", Path.GetFileName(pathToFile));
    }
    else if (isMhtml)
    {
        MHTMLParser parser = new MHTMLParser(File.ReadAllText(pathToFile));
        parser.OutputDirectory = workingFolder;
        parser.DecodeImageData = false;
        var html = parser.getHTMLText();

        // ".html" is appended (not substituted) to the original file name.
        pathToFile = pathToFile + ".html";
        File.WriteAllText(pathToFile, html);
    }

    return pathToFile;
}
/// <summary>
/// Composes the PDF formats of a list of documents into a single PDF and uploads it.
/// The handles are read from the "documentList" job parameter (separated by '|'); the
/// destination handle and file name come from "resultingDocumentHandle" and
/// "resultingDocumentFileName". A document without a PDF format either causes the job
/// to be requeued or is replaced by a generated placeholder file.
/// </summary>
/// <param name="parameters">Job parameters.</param>
/// <param name="workingFolder">Folder used for downloaded and generated files.</param>
/// <returns>
/// A requeue result while a missing PDF may still be produced, otherwise
/// <c>ProcessResult.Ok</c> after the composed file was uploaded.
/// </returns>
/// <exception cref="ApplicationException">A file could not be appended to the composition.</exception>
protected async override Task<ProcessResult> OnPolling(
    Shared.Jobs.PollerJobParameters parameters,
    string workingFolder)
{
    var client = GetDocumentStoreClient(parameters.TenantId);
    var handles = parameters.All["documentList"].Split('|');
    var destinationHandle = parameters.All["resultingDocumentHandle"];
    var destinationFileName = parameters.All["resultingDocumentFileName"];

    bool destinationIsPdf = destinationFileName.EndsWith(".pdf", StringComparison.OrdinalIgnoreCase);
    if (!destinationIsPdf)
    {
        destinationFileName = Path.ChangeExtension(destinationFileName, ".pdf");
    }

    List<FileToComposeData> files = new List<FileToComposeData>();
    foreach (var handle in handles)
    {
        var documentHandle = new DocumentHandle(handle);

        Boolean hasPdf;
        try
        {
            hasPdf = await InnerGetPdf(workingFolder, client, files, handle, documentHandle, false);
        }
        catch (System.Net.WebException)
        {
            Logger.WarnFormat("Handle {0} has no PDF format", handle);
            hasPdf = false;
        }

        if (hasPdf)
        {
            continue;
        }

        // The first 3 times always retry, giving the document store time to generate jobs.
        int requeueCount = GetRequeueCount(parameters);
        if (requeueCount <= 3)
        {
            return GenerateRequeueProcessResult(requeueCount);
        }

        // Check whether a pending job of this document can still generate the pdf.
        var pendingJobs = await client.GetJobsAsync(documentHandle);
        var fileName = await GetfileNameFromHandle(client, documentHandle);
        Boolean mustWaitForJobs = CheckIfSomeJobCanStillProducePdfFormat(pendingJobs, fileName, requeueCount);
        if (mustWaitForJobs)
        {
            return GenerateRequeueProcessResult(requeueCount);
        }

        // No pdf will be produced for this document: record it as missing.
        Logger.WarnFormat("Handle {0} has no pdf format, status of queue is {1}",
            handle,
            String.Join(",", pendingJobs.Select(j => String.Format("{0}[Executed:{1} Success:{2}]", j.QueueName, j.Executed, j.Success))));
        files.Add(FileToComposeData.NoPdfFormat(handle, fileName));
    }

    // Now compose everything.
    PdfManipulator composer = new PdfManipulator(Logger);
    foreach (var entry in files)
    {
        String pdfPath = entry.PdfFileName;
        if (!entry.HasPdfFormat)
        {
            pdfPath = GeneratePlaceholderFile(workingFolder, entry.FileName, entry.DocumentHandle);
        }

        var appendError = composer.AppendDocumentAtEnd(pdfPath);
        if (!String.IsNullOrEmpty(appendError))
        {
            throw new ApplicationException(String.Format("Unable to compose file {0} error {1}", entry.DocumentHandle, appendError));
        }
    }

    composer.AddPageNumber();

    // Save into a fresh sub-folder named with a new GUID.
    String outputDirectory = Path.Combine(workingFolder, Guid.NewGuid().ToString());
    Directory.CreateDirectory(outputDirectory);
    var finalFileName = Path.Combine(outputDirectory, destinationFileName);
    composer.Save(finalFileName);

    await client.UploadAsync(finalFileName, new DocumentHandle(destinationHandle));
    return ProcessResult.Ok;
}
/// <summary>
/// Extracts the textual content of the job's file through the configured analyzers and
/// stores it as the Content and Tika formats of the document.
/// </summary>
/// <remarks>
/// Files whose extension is not in <c>_formats</c>, or that the filter manager rejects,
/// get a null content format instead. When an analyzer throws, the next one returned by
/// <c>BuildAnalyzer</c> is tried until one succeeds or none is left.
/// </remarks>
/// <param name="parameters">Job parameters.</param>
/// <param name="workingFolder">Folder used for downloaded and generated files.</param>
/// <returns>
/// The result of adding the null content format when the file is skipped, otherwise
/// <c>ProcessResult.Ok</c>.
/// </returns>
protected async override Task<ProcessResult> OnPolling(
    PollerJobParameters parameters,
    String workingFolder)
{
    Boolean result;
    var contentFileName = Path.ChangeExtension(parameters.FileName, ".content");

    if (!_formats.Contains(parameters.FileExtension))
    {
        Logger.DebugFormat("Document for job Id {0} has an extension not supported, setting null content", parameters.JobId);
        return new ProcessResult(await AddNullContentFormat(parameters, contentFileName));
    }

    Logger.DebugFormat("Starting tika on job: {0}, file extension {1}", parameters.JobId, parameters.FileExtension);
    Logger.DebugFormat("Downloading blob for job: {0}, on local path {1}", parameters.JobId, workingFolder);

    string pathToFile = await DownloadBlob(parameters.TenantId, parameters.JobId, parameters.FileName, workingFolder);
    pathToFile = ProcessFile(pathToFile, workingFolder);

    Boolean shouldAnalyze = _filterManager.ShouldAnalyze(parameters.FileName, pathToFile);
    if (!shouldAnalyze)
    {
        Logger.InfoFormat("File {0} for job {1} was discharded!", parameters.FileName, parameters.JobId);
        return new ProcessResult(await AddNullContentFormat(parameters, contentFileName));
    }

    Logger.DebugFormat("Search for password JobId:{0}", parameters.JobId);
    var passwords = ClientPasswordSet.GetPasswordFor(parameters.FileName).ToArray();

    String content = "";
    Int32 analyzerOrdinal = 0;
    Boolean success = false;
    var analyzer = BuildAnalyzer(analyzerOrdinal);
    do
    {
        try
        {
            if (passwords.Any())
            {
                // Try every known password; the first one that opens the file wins.
                // A failing password is only logged, so when all of them fail the
                // content stays empty and the attempt still counts as successful.
                foreach (var password in passwords)
                {
                    try
                    {
                        content = analyzer.GetHtmlContent(pathToFile, password) ?? "";
                        break;
                    }
                    catch (Exception)
                    {
                        Logger.ErrorFormat("Error opening file {0} with password", parameters.FileName);
                    }
                }
            }
            else
            {
                // Simply analyze the file without password.
                Logger.DebugFormat("Analyze content JobId: {0} -> Path: {1}", parameters.JobId, pathToFile);
                content = analyzer.GetHtmlContent(pathToFile, "") ?? "";
            }
            success = true;
        }
        catch (Exception ex)
        {
            // The job id was passed as an argument but had no placeholder in the format
            // string, so it never reached the log.
            Logger.ErrorFormat(ex, "Error extracting tika with analyzer {0} on file {1} job {2}", analyzer.Describe(), parameters.FileName, parameters.JobId);
            analyzer = BuildAnalyzer(++analyzerOrdinal);
            if (analyzer != null)
            {
                Logger.InfoFormat("Retry job {0} with analyzer {1}", parameters.JobId, analyzer.Describe());
            }
        }
    } while (analyzer != null && !success);

    // String.Length is O(1); LINQ Count() would enumerate every character.
    Logger.DebugFormat("Finished tika on job: {0}, charsNum {1}", parameters.JobId, content.Length);

    String sanitizedContent = content;
    if (!string.IsNullOrWhiteSpace(content))
    {
        var resultContent = _builder.CreateFromTikaPlain(content);
        var documentContent = resultContent.Content;
        sanitizedContent = resultContent.SanitizedTikaContent;
        var pages = documentContent.Pages.Count();

        // Language is detected on the second page when there is more than one page,
        // and on the only page for single-page documents.
        string lang = null;
        if (pages > 1)
        {
            lang = LanguageDetector.GetLanguage(documentContent.Pages[1].Content);
        }

        if (lang == null && pages == 1)
        {
            lang = LanguageDetector.GetLanguage(documentContent.Pages[0].Content);
        }

        if (lang != null)
        {
            documentContent.AddMetadata(DocumentContent.MedatataLanguage, lang);
        }

        result = await AddFormatToDocumentFromObject(
            parameters.TenantId,
            this.QueueName,
            parameters.JobId,
            new DocumentFormat(DocumentFormats.Content),
            documentContent,
            contentFileName,
            new Dictionary<string, object>());
        Logger.DebugFormat("Added format {0} to jobId {1}, result: {2}", DocumentFormats.Content, parameters.JobId, result);
    }

    // The tika format is always written, even when the extracted content is empty.
    var tikaFileName = Path.Combine(workingFolder, Path.GetFileNameWithoutExtension(parameters.FileName) + ".tika.html");
    tikaFileName = SanitizeFileNameForLength(tikaFileName);
    File.WriteAllText(tikaFileName, sanitizedContent);
    result = await AddFormatToDocumentFromFile(
        parameters.TenantId,
        parameters.JobId,
        new DocumentFormat(DocumentFormats.Tika),
        tikaFileName,
        new Dictionary<string, object>());
    Logger.DebugFormat("Added format {0} to jobId {1}, result: {2}", DocumentFormats.Tika, parameters.JobId, result);

    return ProcessResult.Ok;
}